storcon: implement graceful leader cutover

storcon: add start-up sequence utilities
storcon: refactor building of observed state at start-up
2026-05-25 09:00:37 +00:00 · 2024-07-30 17:58:18 +01:00 · 2024-07-30 17:58:17 +01:00 · 2024-07-30 17:57:09 +01:00 · 2024-07-30 17:57:09 +01:00 · 2024-07-30 17:57:08 +01:00
33 changed files with 1023 additions and 2432 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -19,10 +19,6 @@ on:
        description: 'debug or release'
        required: true
        type: string
-      pg-versions:
-        description: 'a json array of postgres versions to run regression tests on'
-        required: true
-        type: string

 defaults:
  run:
@@ -258,7 +254,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pg_version: ${{ fromJson(inputs.pg-versions) }}
+        pg_version: [ v14, v15, v16 ]
    steps:
      - uses: actions/checkout@v4
        with:
@@ -288,5 +284,5 @@ jobs:
      - name: Merge and upload coverage data
        if: |
          false &&
-          inputs.build-type == 'debug' && matrix.pg_version == 'v16'
+          inputs.build-type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -203,8 +203,7 @@ jobs:
      fail-fast: false
      matrix:
        arch: [ x64 ]
-        # Do not build or run tests in debug for release branches
-        build-type: ${{ fromJson((startsWith(github.ref_name, 'release' && github.event_name == 'push')) && '["release"]' || '["debug", "release"]') }}
+        build-type: [ debug, release ]
        include:
          - build-type: release
            arch: arm64
@@ -214,8 +213,6 @@ jobs:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
      build-tag: ${{ needs.tag.outputs.build-tag }}
      build-type: ${{ matrix.build-type }}
-      # Run tests on all Postgres versions in release builds and only on the latest version in debug builds
-      pg-versions: ${{ matrix.build-type == 'release' && '["v14", "v15", "v16"]' || '["v16"]' }}
    secrets: inherit

  # Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
@@ -309,7 +306,7 @@ jobs:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  create-test-report:
-    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]
+    needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
    if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
    outputs:
      report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -871,7 +868,7 @@ jobs:
        with:
          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Login to ACR
        if: github.ref_name == 'main'
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1744,6 +1744,18 @@ dependencies = [
 "const-random",
 ]

+[[package]]
+name = "dns-lookup"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "socket2 0.5.5",
+ "windows-sys 0.48.0",
+]
+
 [[package]]
 name = "dsl_auto_type"
 version = "0.1.1"
@@ -5724,6 +5736,7 @@ dependencies = [
 "control_plane",
 "diesel",
 "diesel_migrations",
+ "dns-lookup",
 "fail",
 "futures",
 "git-version",
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -144,7 +144,6 @@ impl RemotePath {
 ///
 /// The WithDelimiter mode will populate `prefixes` and `keys` in the result.  The
 /// NoDelimiter mode will only populate `keys`.
-#[derive(Copy, Clone)]
 pub enum ListingMode {
    WithDelimiter,
    NoDelimiter,
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -17,9 +17,11 @@ use pageserver::config::PageserverIdentity;
 use pageserver::control_plane_client::ControlPlaneClient;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
-use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
+use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use pageserver::tenant::{secondary, TenantSharedResources};
-use pageserver::{CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener};
+use pageserver::{
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
+};
 use remote_storage::GenericRemoteStorage;
 use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
@@ -29,9 +31,11 @@ use tracing::*;
 use metrics::set_build_info_metric;
 use pageserver::{
    config::PageServerConf,
+    context::{DownloadBehavior, RequestContext},
    deletion_queue::DeletionQueue,
    http, page_cache, page_service, task_mgr,
-    task_mgr::{BACKGROUND_RUNTIME, MGMT_REQUEST_RUNTIME},
+    task_mgr::TaskKind,
+    task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME},
    tenant::mgr,
    virtual_file,
 };
@@ -125,7 +129,6 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.get_impl, "starting with get page implementation");
    info!(?conf.get_vectored_impl, "starting with vectored get page implementation");
-    info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
@@ -590,13 +593,30 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let libpq_listener = {
+        let cancel = CancellationToken::new();
+        let libpq_ctx = RequestContext::todo_child(
+            TaskKind::LibpqEndpointListener,
+            // listener task shouldn't need to download anything. (We will
+            // create separate sub-contexts for each connection, with their
+            // own download behavior. This context is used only to listen and
+            // accept connections.)
+            DownloadBehavior::Error,
+        );
+
+        let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
+            "libpq listener",
+            page_service::libpq_listener_main(
+                tenant_manager.clone(),
+                pg_auth,
+                pageserver_listener,
+                conf.pg_auth_type,
+                libpq_ctx,
+                cancel.clone(),
+            ),
+        ));
+        LibpqEndpointListener(CancellableTask { task, cancel })
+    };

    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());

@@ -624,7 +644,7 @@ fn start_pageserver(
            shutdown_pageserver.take();
            pageserver::shutdown_pageserver(
                http_endpoint_listener,
-                page_service,
+                libpq_listener,
                consumption_metrics_tasks,
                disk_usage_eviction_task,
                &tenant_manager,
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -29,7 +29,6 @@ use utils::{
    logging::LogFormat,
 };

-use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
 use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{config::TenantConfOpt, timeline::GetImpl};
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -296,10 +295,6 @@ pub struct PageServerConf {
    pub ephemeral_bytes_per_memory_kb: usize,

    pub l0_flush: L0FlushConfig,
-
-    /// This flag is temporary and will be removed after gradual rollout.
-    /// See <https://github.com/neondatabase/neon/issues/8184>.
-    pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -406,8 +401,6 @@ struct PageServerConfigBuilder {
    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,

    l0_flush: BuilderValue<L0FlushConfig>,
-
-    compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
 }

 impl PageServerConfigBuilder {
@@ -497,7 +490,6 @@ impl PageServerConfigBuilder {
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
            l0_flush: Set(L0FlushConfig::default()),
-            compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
        }
    }
 }
@@ -681,10 +673,6 @@ impl PageServerConfigBuilder {
        self.l0_flush = BuilderValue::Set(value);
    }

-    pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) {
-        self.compact_level0_phase1_value_access = BuilderValue::Set(value);
-    }
-
    pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -742,7 +730,6 @@ impl PageServerConfigBuilder {
                image_compression,
                ephemeral_bytes_per_memory_kb,
                l0_flush,
-                compact_level0_phase1_value_access,
            }
            CUSTOM LOGIC
            {
@@ -1015,9 +1002,6 @@ impl PageServerConf {
                "l0_flush" => {
                    builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?)
                }
-                "compact_level0_phase1_value_access" => {
-                    builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?)
-                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1102,7 +1086,6 @@ impl PageServerConf {
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
            l0_flush: L0FlushConfig::default(),
-            compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
        }
    }
 }
@@ -1344,7 +1327,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1419,7 +1401,6 @@ background_task_maximum_delay = '334 s'
                image_compression: defaults::DEFAULT_IMAGE_COMPRESSION,
                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
                l0_flush: L0FlushConfig::default(),
-                compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -296,11 +296,6 @@ impl From<GetActiveTenantError> for ApiError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => {
                ApiError::ResourceUnavailable(format!("{}", e).into())
            }
-            GetActiveTenantError::SwitchedTenant => {
-                // in our HTTP handlers, this error doesn't happen
-                // TODO: separate error types
-                ApiError::ResourceUnavailable("switched tenant".into())
-            }
        }
    }
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,8 +12,6 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
-
-use futures::{stream::FuturesUnordered, StreamExt};
 pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 pub mod aux_file;
@@ -32,13 +30,14 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tenant::{
    mgr::{BackgroundPurges, TenantManager},
    secondary,
 };
-use tracing::{info, info_span};
+use tracing::info;

 /// Current storage format version
 ///
@@ -64,6 +63,7 @@ pub struct CancellableTask {
    pub cancel: CancellationToken,
 }
 pub struct HttpEndpointListener(pub CancellableTask);
+pub struct LibpqEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
 impl CancellableTask {
@@ -77,7 +77,7 @@ impl CancellableTask {
 #[allow(clippy::too_many_arguments)]
 pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
-    page_service: page_service::Listener,
+    libpq_listener: LibpqEndpointListener,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -87,83 +87,10 @@ pub async fn shutdown_pageserver(
    exit_code: i32,
 ) {
    use std::time::Duration;
-
-    // If the orderly shutdown below takes too long, we still want to make
-    // sure that all walredo processes are killed and wait()ed on by us, not systemd.
-    //
-    // (Leftover walredo processes are the hypothesized trigger for the systemd freezes
-    //  that we keep seeing in prod => https://github.com/neondatabase/cloud/issues/11387.
-    //
-    // We use a thread instead of a tokio task because the background runtime is likely busy
-    // with the final flushing / uploads. This activity here has priority, and due to lack
-    // of scheduling priority feature sin the tokio scheduler, using a separate thread is
-    // an effective priority booster.
-    let walredo_extraordinary_shutdown_thread_span = {
-        let span = info_span!(parent: None, "walredo_extraordinary_shutdown_thread");
-        span.follows_from(tracing::Span::current());
-        span
-    };
-    let walredo_extraordinary_shutdown_thread_cancel = CancellationToken::new();
-    let walredo_extraordinary_shutdown_thread = std::thread::spawn({
-        let walredo_extraordinary_shutdown_thread_cancel =
-            walredo_extraordinary_shutdown_thread_cancel.clone();
-        move || {
-            let rt = tokio::runtime::Builder::new_current_thread()
-                .enable_all()
-                .build()
-                .unwrap();
-            let _entered = rt.enter();
-            let _entered = walredo_extraordinary_shutdown_thread_span.enter();
-            if let Ok(()) = rt.block_on(tokio::time::timeout(
-                Duration::from_secs(8),
-                walredo_extraordinary_shutdown_thread_cancel.cancelled(),
-            )) {
-                info!("cancellation requested");
-                return;
-            }
-            let managers = tenant::WALREDO_MANAGERS
-                .lock()
-                .unwrap()
-                // prevents new walredo managers from being inserted
-                .take()
-                .expect("only we take()");
-            // Use FuturesUnordered to get in queue early for each manager's
-            // heavier_once_cell semaphore wait list.
-            // Also, for idle tenants that for some reason haven't
-            // shut down yet, it's quite likely that we're not going
-            // to get Poll::Pending once.
-            let mut futs: FuturesUnordered<_> = managers
-                .into_iter()
-                .filter_map(|(_, mgr)| mgr.upgrade())
-                .map(|mgr| async move { tokio::task::unconstrained(mgr.shutdown()).await })
-                .collect();
-            info!(count=%futs.len(), "built FuturesUnordered");
-            let mut last_log_at = std::time::Instant::now();
-            #[derive(Debug, Default)]
-            struct Results {
-                initiated: u64,
-                already: u64,
-            }
-            let mut results = Results::default();
-            while let Some(we_initiated) = rt.block_on(futs.next()) {
-                if we_initiated {
-                    results.initiated += 1;
-                } else {
-                    results.already += 1;
-                }
-                if last_log_at.elapsed() > Duration::from_millis(100) {
-                    info!(remaining=%futs.len(), ?results, "progress");
-                    last_log_at = std::time::Instant::now();
-                }
-            }
-            info!(?results, "done");
-        }
-    });
-
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    let remaining_connections = timed(
-        page_service.stop_accepting(),
+    timed(
+        libpq_listener.0.shutdown(),
        "shutdown LibpqEndpointListener",
        Duration::from_secs(1),
    )
@@ -181,7 +108,7 @@ pub async fn shutdown_pageserver(
    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
    // should already have been canclled via mgr::shutdown_all_tenants
    timed(
-        remaining_connections.shutdown(),
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
        "shutdown PageRequestHandlers",
        Duration::from_secs(1),
    )
@@ -235,12 +162,6 @@ pub async fn shutdown_pageserver(
        Duration::from_secs(1),
    )
    .await;
-
-    info!("cancel & join walredo_extraordinary_shutdown_thread");
-    walredo_extraordinary_shutdown_thread_cancel.cancel();
-    walredo_extraordinary_shutdown_thread.join().unwrap();
-    info!("walredo_extraordinary_shutdown_thread done");
-
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -8,7 +8,8 @@ use std::time::Duration;
 pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for a one Key.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq))]
 pub enum Value {
    /// An Image value contains a full copy of the value
    Image(Bytes),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -33,7 +33,6 @@ use remote_storage::GenericRemoteStorage;
 use remote_storage::TimeoutOrCancel;
 use std::collections::BTreeMap;
 use std::fmt;
-use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
@@ -313,66 +312,14 @@ impl std::fmt::Debug for Tenant {
 }

 pub(crate) enum WalRedoManager {
-    Prod(WalredoManagerId, PostgresRedoManager),
+    Prod(PostgresRedoManager),
    #[cfg(test)]
    Test(harness::TestRedoManager),
 }

-#[derive(thiserror::Error, Debug)]
-#[error("pageserver is shutting down")]
-pub(crate) struct GlobalShutDown;
-
-impl WalRedoManager {
-    pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
-        let id = WalredoManagerId::next();
-        let arc = Arc::new(Self::Prod(id, mgr));
-        let mut guard = WALREDO_MANAGERS.lock().unwrap();
-        match &mut *guard {
-            Some(map) => {
-                map.insert(id, Arc::downgrade(&arc));
-                Ok(arc)
-            }
-            None => Err(GlobalShutDown),
-        }
-    }
-}
-
-impl Drop for WalRedoManager {
-    fn drop(&mut self) {
-        match self {
-            Self::Prod(id, _) => {
-                let mut guard = WALREDO_MANAGERS.lock().unwrap();
-                if let Some(map) = &mut *guard {
-                    map.remove(id).expect("new() registers, drop() unregisters");
-                }
-            }
-            #[cfg(test)]
-            Self::Test(_) => {
-                // Not applicable to test redo manager
-            }
-        }
-    }
-}
-
-/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
-/// the walredo processes outside of the regular order.
-///
-/// This is necessary to work around a systemd bug where it freezes if there are
-/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
-#[allow(clippy::type_complexity)]
-pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
-    Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
-> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
-#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
-pub(crate) struct WalredoManagerId(u64);
-impl WalredoManagerId {
-    pub fn next() -> Self {
-        static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
-        }
-        Self(id)
+impl From<PostgresRedoManager> for WalRedoManager {
+    fn from(mgr: PostgresRedoManager) -> Self {
+        Self::Prod(mgr)
    }
 }

@@ -384,20 +331,19 @@ impl From<harness::TestRedoManager> for WalRedoManager {
 }

 impl WalRedoManager {
-    pub(crate) async fn shutdown(&self) -> bool {
+    pub(crate) async fn shutdown(&self) {
        match self {
-            Self::Prod(_, mgr) => mgr.shutdown().await,
+            Self::Prod(mgr) => mgr.shutdown().await,
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
-                true
            }
        }
    }

    pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
        match self {
-            Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
+            Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
            #[cfg(test)]
            Self::Test(_) => {
                // Not applicable to test redo manager
@@ -417,7 +363,7 @@ impl WalRedoManager {
        pg_version: u32,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
-            Self::Prod(_, mgr) => {
+            Self::Prod(mgr) => {
                mgr.request_redo(key, lsn, base_img, records, pg_version)
                    .await
            }
@@ -431,7 +377,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(_, m) => Some(m.status()),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }
@@ -440,8 +386,6 @@ impl WalRedoManager {

 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
-    #[error("Timeline is shutting down")]
-    ShuttingDown,
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
        tenant_id: TenantShardId,
@@ -731,9 +675,11 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
-        let wal_redo_manager =
-            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
+    ) -> Arc<Tenant> {
+        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
+            conf,
+            tenant_shard_id,
+        )));

        let TenantSharedResources {
            broker_client,
@@ -932,7 +878,7 @@ impl Tenant {
            }
            .instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
        );
-        Ok(tenant)
+        tenant
    }

    #[instrument(skip_all)]
@@ -7401,7 +7347,6 @@ mod tests {
                Lsn(0x60),
                &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                3,
-                None,
            )
            .await
            .unwrap();
@@ -7526,7 +7471,7 @@ mod tests {
            ),
        ];
        let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3)
            .await
            .unwrap();
        let expected_res = KeyHistoryRetention {
@@ -7572,114 +7517,6 @@ mod tests {
        };
        assert_eq!(res, expected_res);

-        // In case of branch compaction, the branch itself does not have the full history, and we need to provide
-        // the ancestor image in the test case.
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![(
-                Lsn(0x60),
-                KeyLogAtLsn(vec![(
-                    Lsn(0x60),
-                    Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
-                )]),
-            )],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
-        let history = vec![
-            (
-                key,
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-            ),
-            (
-                key,
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
-            ),
-            (
-                key,
-                Lsn(0x60),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
-            ),
-            (
-                key,
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            ),
-        ];
-        let res = tline
-            .generate_key_retention(
-                key,
-                &history,
-                Lsn(0x60),
-                &[Lsn(0x30)],
-                3,
-                Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
-            )
-            .await
-            .unwrap();
-        let expected_res = KeyHistoryRetention {
-            below_horizon: vec![
-                (
-                    Lsn(0x30),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x20),
-                        Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
-                    )]),
-                ),
-                (
-                    Lsn(0x60),
-                    KeyLogAtLsn(vec![(
-                        Lsn(0x60),
-                        Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
-                    )]),
-                ),
-            ],
-            above_horizon: KeyLogAtLsn(vec![(
-                Lsn(0x70),
-                Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
-            )]),
-        };
-        assert_eq!(res, expected_res);
-
        Ok(())
    }

@@ -7878,186 +7715,4 @@ mod tests {

        Ok(())
    }
-
-    #[tokio::test]
-    async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
-        let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
-        let (tenant, ctx) = harness.load().await;
-
-        fn get_key(id: u32) -> Key {
-            let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
-            key.field6 = id;
-            key
-        }
-
-        let img_layer = (0..10)
-            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
-            .collect_vec();
-
-        let delta1 = vec![
-            (
-                get_key(1),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(2),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x28),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x30),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
-            ),
-            (
-                get_key(3),
-                Lsn(0x40),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
-            ),
-        ];
-        let delta2 = vec![
-            (
-                get_key(5),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-            (
-                get_key(6),
-                Lsn(0x20),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
-            ),
-        ];
-        let delta3 = vec![
-            (
-                get_key(8),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-            (
-                get_key(9),
-                Lsn(0x48),
-                Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
-            ),
-        ];
-
-        let parent_tline = tenant
-            .create_test_timeline_with_layers(
-                TIMELINE_ID,
-                Lsn(0x10),
-                DEFAULT_PG_VERSION,
-                &ctx,
-                vec![],                       // delta layers
-                vec![(Lsn(0x18), img_layer)], // image layers
-                Lsn(0x18),
-            )
-            .await?;
-
-        parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        let branch_tline = tenant
-            .branch_timeline_test_with_layers(
-                &parent_tline,
-                NEW_TIMELINE_ID,
-                Some(Lsn(0x18)),
-                &ctx,
-                vec![
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
-                    DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
-                ], // delta layers
-                vec![], // image layers
-                Lsn(0x50),
-            )
-            .await?;
-
-        branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
-
-        {
-            // Update GC info
-            let mut guard = parent_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
-                    space: Lsn(0x10),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        {
-            // Update GC info
-            let mut guard = branch_tline.gc_info.write().unwrap();
-            *guard = GcInfo {
-                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)],
-                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
-                    space: Lsn(0x50),
-                },
-                leases: Default::default(),
-                within_ancestor_pitr: false,
-            };
-        }
-
-        let expected_result_at_gc_horizon = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10@0x48"),
-            Bytes::from_static(b"value 9@0x10@0x48"),
-        ];
-
-        let expected_result_at_lsn_40 = [
-            Bytes::from_static(b"value 0@0x10"),
-            Bytes::from_static(b"value 1@0x10@0x20"),
-            Bytes::from_static(b"value 2@0x10@0x30"),
-            Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
-            Bytes::from_static(b"value 4@0x10"),
-            Bytes::from_static(b"value 5@0x10@0x20"),
-            Bytes::from_static(b"value 6@0x10@0x20"),
-            Bytes::from_static(b"value 7@0x10"),
-            Bytes::from_static(b"value 8@0x10"),
-            Bytes::from_static(b"value 9@0x10"),
-        ];
-
-        let verify_result = || async {
-            for idx in 0..10 {
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x50), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_gc_horizon[idx]
-                );
-                assert_eq!(
-                    branch_tline
-                        .get(get_key(idx as u32), Lsn(0x40), &ctx)
-                        .await
-                        .unwrap(),
-                    &expected_result_at_lsn_40[idx]
-                );
-            }
-        };
-
-        verify_result().await;
-
-        let cancel = CancellationToken::new();
-        branch_tline.compact_with_gc(&cancel, &ctx).await.unwrap();
-
-        verify_result().await;
-
-        Ok(())
-    }
 }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -296,19 +296,13 @@ where
            let mut stack = Vec::new();
            stack.push((self.root_blk, None));
            let block_cursor = self.reader.block_cursor();
-            let mut node_buf = [0_u8; PAGE_SZ];
            while let Some((node_blknum, opt_iter)) = stack.pop() {
-                // Read the node, through the PS PageCache, into local variable `node_buf`.
-                // We could keep the page cache read guard alive, but, at the time of writing,
-                // we run quite small PS PageCache s => can't risk running out of
-                // PageCache space because this stream isn't consumed fast enough.
-                let page_read_guard = block_cursor
+                // Read the node's block through the block cursor.
+                let node_buf = block_cursor
                    .read_blk(self.start_blk + node_blknum, ctx)
                    .await?;
-                node_buf.copy_from_slice(page_read_guard.as_ref());
-                drop(page_read_guard); // drop page cache read guard early

-                let node = OnDiskNode::deparse(&node_buf)?;
+                let node = OnDiskNode::deparse(node_buf.as_ref())?;
                let prefix_len = node.prefix_len as usize;
                let suffix_len = node.suffix_len as usize;

@@ -351,7 +345,6 @@ where
                    Either::Left(idx..node.num_children.into())
                };

-
                // idx points to the first match now. Keep going from there
                while let Some(idx) = iter.next() {
                    let key_off = idx * suffix_len;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -55,7 +55,7 @@ use utils::id::{TenantId, TimelineId};
 use super::remote_timeline_client::remote_tenant_path;
 use super::secondary::SecondaryTenant;
 use super::timeline::detach_ancestor::PreparedTimelineDetach;
-use super::{GlobalShutDown, TenantSharedResources};
+use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -116,6 +116,8 @@ pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
    /// ignore it.
    Zero,
+    /// Pick the first shard we find for the TenantId, whatever its shard number
+    First,
    /// Pick the shard that holds this key
    Page(Key),
    /// The shard ID is known: pick the given shard
@@ -665,20 +667,17 @@ pub async fn init_tenant_mgr(
        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
        let shard_identity = location_conf.shard;
        let slot = match location_conf.mode {
-            LocationMode::Attached(attached_conf) => TenantSlot::Attached(
-                tenant_spawn(
-                    conf,
-                    tenant_shard_id,
-                    &tenant_dir_path,
-                    resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
-                    shard_identity,
-                    Some(init_order.clone()),
-                    SpawnMode::Lazy,
-                    &ctx,
-                )
-                .expect("global shutdown during init_tenant_mgr cannot happen"),
-            ),
+            LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn(
+                conf,
+                tenant_shard_id,
+                &tenant_dir_path,
+                resources.clone(),
+                AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                shard_identity,
+                Some(init_order.clone()),
+                SpawnMode::Lazy,
+                &ctx,
+            )),
            LocationMode::Secondary(secondary_conf) => {
                info!(
                    tenant_id = %tenant_shard_id.tenant_id,
@@ -726,7 +725,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Arc<Tenant> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -1193,10 +1192,7 @@ impl TenantManager {
                    None,
                    spawn_mode,
                    ctx,
-                )
-                .map_err(|_: GlobalShutDown| {
-                    UpsertLocationError::Unavailable(TenantMapError::ShuttingDown)
-                })?;
+                );

                TenantSlot::Attached(tenant)
            }
@@ -1317,7 +1313,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -2051,7 +2047,7 @@ impl TenantManager {
            None,
            SpawnMode::Eager,
            ctx,
-        )?;
+        );

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

@@ -2092,6 +2088,7 @@ impl TenantManager {
                    };

                    match selector {
+                        ShardSelector::First => return ShardResolveResult::Found(tenant.clone()),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return ShardResolveResult::Found(tenant.clone())
                        }
@@ -2173,9 +2170,6 @@ pub(crate) enum GetActiveTenantError {
    /// never happen.
    #[error("Tenant is broken: {0}")]
    Broken(String),
-
-    #[error("reconnect to switch tenant id")]
-    SwitchedTenant,
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3,7 +3,6 @@ pub(crate) mod compaction;
 pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
-pub(crate) mod handle;
 mod init;
 pub mod layer_manager;
 pub(crate) mod logical_size;
@@ -18,7 +17,6 @@ use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use enumset::EnumSet;
 use fail::fail_point;
-use handle::ShardTimelineId;
 use once_cell::sync::Lazy;
 use pageserver_api::{
    key::{
@@ -76,7 +74,6 @@ use crate::{
        metadata::TimelineMetadata,
        storage_layer::PersistentLayerDesc,
    },
-    walredo,
 };
 use crate::{
    context::{DownloadBehavior, RequestContext},
@@ -427,8 +424,6 @@ pub struct Timeline {
    pub(crate) extra_test_dense_keyspace: ArcSwap<KeySpace>,

    pub(crate) l0_flush_global_state: L0FlushGlobalState,
-
-    pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
 }

 pub struct WalReceiverInfo {
@@ -534,6 +529,7 @@ impl GetVectoredError {
    }
 }

+#[derive(Debug)]
 pub struct MissingKeyError {
    key: Key,
    shard: ShardNumber,
@@ -544,12 +540,6 @@ pub struct MissingKeyError {
    backtrace: Option<std::backtrace::Backtrace>,
 }

-impl std::fmt::Debug for MissingKeyError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self)
-    }
-}
-
 impl std::fmt::Display for MissingKeyError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -1001,10 +991,7 @@ impl Timeline {
            .for_get_kind(GetKind::Singular)
            .observe(elapsed.as_secs_f64());

-        if cfg!(feature = "testing")
-            && res.is_err()
-            && !matches!(res, Err(PageReconstructError::Cancelled))
-        {
+        if cfg!(feature = "testing") && res.is_err() {
            // it can only be walredo issue
            use std::fmt::Write;

@@ -1923,9 +1910,6 @@ impl Timeline {
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

-        // Ensure Prevent new page service requests from starting.
-        self.handles.shutdown();
-
        // Transition the remote_client into a state where it's only useful for timeline deletion.
        // (The deletion use case is why we can't just hook up remote_client to Self::cancel).)
        self.remote_client.stop();
@@ -2451,8 +2435,6 @@ impl Timeline {
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

                l0_flush_global_state: resources.l0_flush_global_state,
-
-                handles: Default::default(),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -3722,17 +3704,6 @@ impl Timeline {
        &self.shard_identity
    }

-    #[inline(always)]
-    pub(crate) fn shard_timeline_id(&self) -> ShardTimelineId {
-        ShardTimelineId {
-            shard_index: ShardIndex {
-                shard_number: self.shard_identity.number,
-                shard_count: self.shard_identity.count,
-            },
-            timeline_id: self.timeline_id,
-        }
-    }
-
    ///
    /// Get a handle to the latest layer for appending.
    ///
@@ -5470,22 +5441,20 @@ impl Timeline {
                } else {
                    trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
                };
-                let res = self
+
+                let img = match self
                    .walredo_mgr
                    .as_ref()
                    .context("timeline has no walredo manager")
                    .map_err(PageReconstructError::WalRedo)?
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
-                    .await;
-                let img = match res {
+                    .await
+                    .context("reconstruct a page image")
+                {
                    Ok(img) => img,
-                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
-                    Err(walredo::Error::Other(e)) => {
-                        return Err(PageReconstructError::WalRedo(
-                            e.context("reconstruct a page image"),
-                        ))
-                    }
+                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };
+
                Ok(img)
            }
        }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -15,7 +15,6 @@ use super::{
 };

 use anyhow::{anyhow, Context};
-use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
@@ -70,21 +69,17 @@ impl KeyHistoryRetention {
        self,
        key: Key,
        delta_writer: &mut Vec<(Key, Lsn, Value)>,
-        mut image_writer: Option<&mut ImageLayerWriter>,
+        image_writer: &mut ImageLayerWriter,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut first_batch = true;
-        for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
+        for (_, KeyLogAtLsn(logs)) in self.below_horizon {
            if first_batch {
                if logs.len() == 1 && logs[0].1.is_image() {
                    let Value::Image(img) = &logs[0].1 else {
                        unreachable!()
                    };
-                    if let Some(image_writer) = image_writer.as_mut() {
-                        image_writer.put_image(key, img.clone(), ctx).await?;
-                    } else {
-                        delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
-                    }
+                    image_writer.put_image(key, img.clone(), ctx).await?;
                } else {
                    for (lsn, val) in logs {
                        delta_writer.push((key, lsn, val));
@@ -703,140 +698,7 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        // If there's both a Value::Image and Value::WalRecord for the same (key,lsn),
-        // then the Value::Image is ordered before Value::WalRecord.
-        //
-        // TODO(https://github.com/neondatabase/neon/issues/8184): remove the page cached blob_io
-        // option and validation code once we've reached confidence.
-        enum AllValuesIter<'a> {
-            PageCachedBlobIo {
-                all_keys_iter: VecIter<'a>,
-            },
-            StreamingKmergeBypassingPageCache {
-                merge_iter: MergeIterator<'a>,
-            },
-            ValidatingStreamingKmergeBypassingPageCache {
-                mode: CompactL0BypassPageCacheValidation,
-                merge_iter: MergeIterator<'a>,
-                all_keys_iter: VecIter<'a>,
-            },
-        }
-        type VecIter<'a> = std::slice::Iter<'a, DeltaEntry<'a>>; // TODO: distinguished lifetimes
-        impl AllValuesIter<'_> {
-            async fn next_all_keys_iter(
-                iter: &mut VecIter<'_>,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                let Some(DeltaEntry {
-                    key,
-                    lsn,
-                    val: value_ref,
-                    ..
-                }) = iter.next()
-                else {
-                    return Ok(None);
-                };
-                let value = value_ref.load(ctx).await?;
-                Ok(Some((*key, *lsn, value)))
-            }
-            async fn next(
-                &mut self,
-                ctx: &RequestContext,
-            ) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
-                match self {
-                    AllValuesIter::PageCachedBlobIo { all_keys_iter: iter } => {
-                      Self::next_all_keys_iter(iter, ctx).await
-                    }
-                    AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter } => merge_iter.next().await,
-                    AllValuesIter::ValidatingStreamingKmergeBypassingPageCache { mode, merge_iter, all_keys_iter } => async {
-                        // advance both iterators
-                        let all_keys_iter_item = Self::next_all_keys_iter(all_keys_iter, ctx).await;
-                        let merge_iter_item = merge_iter.next().await;
-                        // compare results & log warnings as needed
-                        macro_rules! rate_limited_warn {
-                            ($($arg:tt)*) => {{
-                                if cfg!(debug_assertions) || cfg!(feature = "testing") {
-                                    warn!($($arg)*);
-                                    panic!("CompactL0BypassPageCacheValidation failure, check logs");
-                                }
-                                use once_cell::sync::Lazy;
-                                use utils::rate_limit::RateLimit;
-                                use std::sync::Mutex;
-                                use std::time::Duration;
-                                static LOGGED: Lazy<Mutex<RateLimit>> =
-                                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
-                                let mut rate_limit = LOGGED.lock().unwrap();
-                                rate_limit.call(|| {
-                                    warn!($($arg)*);
-                                });
-                            }}
-                        }
-                        match (&all_keys_iter_item, &merge_iter_item) {
-                            (Err(_), Err(_)) => {
-                                // don't bother asserting equivality of the errors
-                            }
-                            (Err(all_keys), Ok(merge)) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned an error where merge did not: {all_keys:?}");
-                            },
-                            (Ok(all_keys), Err(merge)) => {
-                                rate_limited_warn!(?all_keys, "merge returned an error where all_keys_iter did not: {merge:?}");
-                            },
-                            (Ok(None), Ok(None)) => { }
-                            (Ok(Some(all_keys)), Ok(None)) => {
-                                rate_limited_warn!(?all_keys, "merge returned None where all_keys_iter returned Some");
-                            }
-                            (Ok(None), Ok(Some(merge))) => {
-                                rate_limited_warn!(?merge, "all_keys_iter returned None where merge returned Some");
-                            }
-                            (Ok(Some((all_keys_key, all_keys_lsn, all_keys_value))), Ok(Some((merge_key, merge_lsn, merge_value)))) => {
-                                match mode {
-                                    // TODO: in this mode, we still load the value from disk for both iterators, even though we only need the all_keys_iter one
-                                    CompactL0BypassPageCacheValidation::KeyLsn => {
-                                        let all_keys = (all_keys_key, all_keys_lsn);
-                                        let merge = (merge_key, merge_lsn);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN) than all_keys_iter");
-                                        }
-                                    }
-                                    CompactL0BypassPageCacheValidation::KeyLsnValue => {
-                                        let all_keys = (all_keys_key, all_keys_lsn, all_keys_value);
-                                        let merge = (merge_key, merge_lsn, merge_value);
-                                        if all_keys != merge {
-                                            rate_limited_warn!(?all_keys, ?merge, "merge returned a different (Key,LSN,Value) than all_keys_iter");
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                        // in case of mismatch, trust the legacy all_keys_iter_item
-                        all_keys_iter_item
-                    }.instrument(info_span!("next")).await
-                }
-            }
-        }
-        let mut all_values_iter = match &self.conf.compact_level0_phase1_value_access {
-            CompactL0Phase1ValueAccess::PageCachedBlobIo => AllValuesIter::PageCachedBlobIo {
-                all_keys_iter: all_keys.iter(),
-            },
-            CompactL0Phase1ValueAccess::StreamingKmerge { validate } => {
-                let merge_iter = {
-                    let mut deltas = Vec::with_capacity(deltas_to_compact.len());
-                    for l in deltas_to_compact.iter() {
-                        let l = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
-                        deltas.push(l);
-                    }
-                    MergeIterator::create(&deltas, &[], ctx)
-                };
-                match validate {
-                    None => AllValuesIter::StreamingKmergeBypassingPageCache { merge_iter },
-                    Some(validate) => AllValuesIter::ValidatingStreamingKmergeBypassingPageCache {
-                        mode: validate.clone(),
-                        merge_iter,
-                        all_keys_iter: all_keys.iter(),
-                    },
-                }
-            }
-        };
+        let all_values_iter = all_keys.iter();

        // This iterator walks through all keys and is needed to calculate size used by each key
        let mut all_keys_iter = all_keys
@@ -909,11 +771,11 @@ impl Timeline {
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
        let mut next_hole = 0; // index of next hole in holes vector

-        while let Some((key, lsn, value)) = all_values_iter
-            .next(ctx)
-            .await
-            .map_err(CompactionError::Other)?
+        for &DeltaEntry {
+            key, lsn, ref val, ..
+        } in all_values_iter
        {
+            let value = val.load(ctx).await.map_err(CompactionError::Other)?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
@@ -1098,10 +960,6 @@ impl Timeline {
            }
        }

-        // Without this, rustc complains about deltas_to_compact still
-        // being borrowed when we `.into_iter()` below.
-        drop(all_values_iter);
-
        Ok(CompactLevel0Phase1Result {
            new_layers,
            deltas_to_compact: deltas_to_compact
@@ -1209,43 +1067,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
    }
 }

-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
-pub enum CompactL0Phase1ValueAccess {
-    /// The old way.
-    PageCachedBlobIo,
-    /// The new way.
-    StreamingKmerge {
-        /// If set, we run both the old way and the new way, validate that
-        /// they are identical (=> [`CompactL0BypassPageCacheValidation`]),
-        /// and if the validation fails,
-        /// - in tests: fail them with a panic or
-        /// - in prod, log a rate-limited warning and use the old way's results.
-        ///
-        /// If not set, we only run the new way and trust its results.
-        validate: Option<CompactL0BypassPageCacheValidation>,
-    },
-}
-
-/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`].
-#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)]
-#[serde(rename_all = "kebab-case")]
-pub enum CompactL0BypassPageCacheValidation {
-    /// Validate that the series of (key, lsn) pairs are the same.
-    KeyLsn,
-    /// Validate that the entire output of old and new way is identical.
-    KeyLsnValue,
-}
-
-impl Default for CompactL0Phase1ValueAccess {
-    fn default() -> Self {
-        CompactL0Phase1ValueAccess::StreamingKmerge {
-            // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident
-            validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue),
-        }
-    }
-}
-
 impl Timeline {
    /// Entry point for new tiered compaction algorithm.
    ///
@@ -1333,7 +1154,6 @@ impl Timeline {
        horizon: Lsn,
        retain_lsn_below_horizon: &[Lsn],
        delta_threshold_cnt: usize,
-        base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
    ) -> anyhow::Result<KeyHistoryRetention> {
        // Pre-checks for the invariants
        if cfg!(debug_assertions) {
@@ -1363,7 +1183,6 @@ impl Timeline {
                );
            }
        }
-        let has_ancestor = base_img_from_ancestor.is_some();
        // Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
        // and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
        let (mut split_history, lsn_split_points) = {
@@ -1397,9 +1216,6 @@ impl Timeline {
                        // For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
                        // keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
                        // dropped.
-                        //
-                        // TODO: in case we have both delta + images for a given LSN and it does not exceed the delta
-                        // threshold, we could have kept delta instead to save space. This is an optimization for the future.
                        continue;
                    }
                }
@@ -1417,13 +1233,9 @@ impl Timeline {
            "should have at least below + above horizon batches"
        );
        let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
-        }
        for (i, split_for_lsn) in split_history.into_iter().enumerate() {
-            // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly.
            records_since_last_image += split_for_lsn.len();
-            let generate_image = if i == 0 && !has_ancestor {
+            let generate_image = if i == 0 {
                // We always generate images for the first batch (below horizon / lowest retain_lsn)
                true
            } else if i == batch_cnt - 1 {
@@ -1546,25 +1358,20 @@ impl Timeline {
            retain_lsns_below_horizon.sort();
            (selected_layers, gc_cutoff, retain_lsns_below_horizon)
        };
-        let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
-            Lsn(self.ancestor_lsn.0 + 1)
-        } else {
-            let res = retain_lsns_below_horizon
-                .first()
-                .copied()
-                .unwrap_or(gc_cutoff);
-            if cfg!(debug_assertions) {
-                assert_eq!(
-                    res,
-                    retain_lsns_below_horizon
-                        .iter()
-                        .min()
-                        .copied()
-                        .unwrap_or(gc_cutoff)
-                );
-            }
-            res
-        };
+        let lowest_retain_lsn = retain_lsns_below_horizon
+            .first()
+            .copied()
+            .unwrap_or(gc_cutoff);
+        if cfg!(debug_assertions) {
+            assert_eq!(
+                lowest_retain_lsn,
+                retain_lsns_below_horizon
+                    .iter()
+                    .min()
+                    .copied()
+                    .unwrap_or(gc_cutoff)
+            );
+        }
        info!(
            "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
            layer_selection.len(),
@@ -1605,7 +1412,6 @@ impl Timeline {
        let mut accumulated_values = Vec::new();
        let mut last_key: Option<Key> = None;

-        #[allow(clippy::too_many_arguments)]
        async fn flush_deltas(
            deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
            last_key: Key,
@@ -1614,7 +1420,6 @@ impl Timeline {
            tline: &Arc<Timeline>,
            lowest_retain_lsn: Lsn,
            ctx: &RequestContext,
-            last_batch: bool,
        ) -> anyhow::Result<Option<ResidentLayer>> {
            // Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
            // overlapping layers.
@@ -1635,7 +1440,7 @@ impl Timeline {
                *current_delta_split_point += 1;
                need_split = true;
            }
-            if !need_split && !last_batch {
+            if !need_split {
                return Ok(None);
            }
            let deltas = std::mem::take(deltas);
@@ -1660,44 +1465,15 @@ impl Timeline {
            Ok(Some(delta_layer))
        }

-        // Only create image layers when there is no ancestor branches. TODO: create covering image layer
-        // when some condition meet.
-        let mut image_layer_writer = if self.ancestor_timeline.is_none() {
-            Some(
-                ImageLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    &(Key::MIN..Key::MAX), // covers the full key range
-                    lowest_retain_lsn,
-                    ctx,
-                )
-                .await?,
-            )
-        } else {
-            None
-        };
-
-        /// Returns None if there is no ancestor branch. Throw an error when the key is not found.
-        ///
-        /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
-        /// is needed for reconstruction. This should be fixed in the future.
-        ///
-        /// Furthermore, we should do vectored get instead of a single get, or better, use k-merge for ancestor
-        /// images.
-        async fn get_ancestor_image(
-            tline: &Arc<Timeline>,
-            key: Key,
-            ctx: &RequestContext,
-        ) -> anyhow::Result<Option<(Key, Lsn, Bytes)>> {
-            if tline.ancestor_timeline.is_none() {
-                return Ok(None);
-            };
-            // This function is implemented as a get of the current timeline at ancestor LSN, therefore reusing
-            // as much existing code as possible.
-            let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
-            Ok(Some((key, tline.ancestor_lsn, img)))
-        }
+        let mut image_layer_writer = ImageLayerWriter::new(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            &(Key::MIN..Key::MAX), // covers the full key range
+            lowest_retain_lsn,
+            ctx,
+        )
+        .await?;

        let mut delta_values = Vec::new();
        let delta_split_points = delta_split_points.into_iter().collect_vec();
@@ -1718,17 +1494,11 @@ impl Timeline {
                        gc_cutoff,
                        &retain_lsns_below_horizon,
                        COMPACTION_DELTA_THRESHOLD,
-                        get_ancestor_image(self, *last_key, ctx).await?,
                    )
                    .await?;
                // Put the image into the image layer. Currently we have a single big layer for the compaction.
                retention
-                    .pipe_to(
-                        *last_key,
-                        &mut delta_values,
-                        image_layer_writer.as_mut(),
-                        ctx,
-                    )
+                    .pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
                    .await?;
                delta_layers.extend(
                    flush_deltas(
@@ -1739,7 +1509,6 @@ impl Timeline {
                        self,
                        lowest_retain_lsn,
                        ctx,
-                        false,
                    )
                    .await?,
                );
@@ -1758,17 +1527,11 @@ impl Timeline {
                gc_cutoff,
                &retain_lsns_below_horizon,
                COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx).await?,
            )
            .await?;
        // Put the image into the image layer. Currently we have a single big layer for the compaction.
        retention
-            .pipe_to(
-                last_key,
-                &mut delta_values,
-                image_layer_writer.as_mut(),
-                ctx,
-            )
+            .pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
            .await?;
        delta_layers.extend(
            flush_deltas(
@@ -1779,25 +1542,19 @@ impl Timeline {
                self,
                lowest_retain_lsn,
                ctx,
-                true,
            )
            .await?,
        );
-        assert!(delta_values.is_empty(), "unprocessed keys");

-        let image_layer = if let Some(writer) = image_layer_writer {
-            Some(writer.finish(self, ctx).await?)
-        } else {
-            None
-        };
+        let image_layer = image_layer_writer.finish(self, ctx).await?;
        info!(
            "produced {} delta layers and {} image layers",
            delta_layers.len(),
-            if image_layer.is_some() { 1 } else { 0 }
+            1
        );
        let mut compact_to = Vec::new();
        compact_to.extend(delta_layers);
-        compact_to.extend(image_layer);
+        compact_to.push(image_layer);
        // Step 3: Place back to the layer map.
        {
            let mut guard = self.layers.write().await;
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,967 +0,0 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
-//!
-//! # Motivation
-//!
-//! On a single page service connection, we're typically serving a single TenantTimelineId.
-//!
-//! Without sharding, there is a single Timeline object to which we dispatch
-//! all requests. For example, a getpage request gets dispatched to the
-//! Timeline::get method of the Timeline object that represents the
-//! (tenant,timeline) of that connection.
-//!
-//! With sharding, for each request that comes in on the connection,
-//! we first have to perform shard routing based on the requested key (=~ page number).
-//! The result of shard routing is a Timeline object.
-//! We then dispatch the request to that Timeline object.
-//!
-//! Regardless of whether the tenant is sharded or not, we want to ensure that
-//! we hold the Timeline gate open while we're invoking the method on the
-//! Timeline object.
-//!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
-//!
-//! Regardless of how we accomplish the above, it should not
-//! prevent the Timeline from shutting down promptly.
-//!
-//! # Design
-//!
-//! There are three user-facing data structures:
-//! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
-//! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
-//!   Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
-//!
-//! The `Handle` is just a wrapper around an `Arc<HandleInner>`.
-//!
-//! There is one long-lived `Arc<HandleInner>`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak<HandleInner>` for each cached Timeline.
-//!
-//! To dispatch a request, the page service connection calls `Cache::get`.
-//!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and construct an
-//! `Arc<HandleInner>`. We store a `Weak<HandleInner>` in the cache
-//! and the `Arc<HandleInner>` in the `PerTimelineState`.
-//!
-//! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak<HandleInner>` in the cache.
-//! We upgrade the `Weak<HandleInner>` to an `Arc<HandleInner>` and wrap it in the user-facing `Handle` type.
-//!
-//! The request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
-//!
-//! # Memory Management / How The Reference Cycle Is Broken
-//!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc<HandleInner>` to `PerTimelineState` to `Arc<Timeline>`.
-//!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak<HandleInner>` to an `Arc<HandleInner>` in a single atomic operation.
-//!
-//! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
-//!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
-//! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
-//!
-//! # Fast Path for Shard Routing
-//!
-//! The `Cache` has a fast path for shard routing to avoid calling into
-//! the tenant manager for every request.
-//!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak<HandleInner>`.
-//!
-//! The current implementation uses the first entry in the hash map
-//! to determine the `ShardParameters` and derive the correct
-//! `ShardIndex` for the requested key.
-//!
-//! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
-//!
-//! If the lookup is successful and the `Weak<HandleInner>` can be upgraded,
-//! it's a hit.
-//!
-//! ## Cache invalidation
-//!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
-//! The only reasons why an entry in the cache can become stale are:
-//! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
-//!    being detached, timeline or shard deleted, or pageserver is shutting down.
-//! 2. We're doing a shard split and new traffic should be routed to the child shards.
-//!
-//! Regarding (1), we will eventually fail to upgrade the `Weak<HandleInner>` once the
-//! timeline has shut down, and when that happens, we remove the entry from the cache.
-//!
-//! Regarding (2), the insight is that it is toally fine to keep dispatching requests
-//! to the parent shard during a shard split. Eventually, the shard split task will
-//! shut down the parent => case (1).
-
-use std::collections::hash_map;
-use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-use std::sync::Mutex;
-use std::sync::Weak;
-
-use pageserver_api::shard::ShardIdentity;
-use tracing::instrument;
-use tracing::trace;
-use utils::id::TimelineId;
-use utils::shard::ShardIndex;
-use utils::shard::ShardNumber;
-
-use crate::tenant::mgr::ShardSelector;
-
-/// The requirement for Debug is so that #[derive(Debug)] works in some places.
-pub(crate) trait Types: Sized + std::fmt::Debug {
-    type TenantManagerError: Sized + std::fmt::Debug;
-    type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
-}
-
-/// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
-/// Required so [`Cache::drop`] can take out the handles from the [`PerTimelineState`].
-/// Alternative to this would be to allocate [`Cache`] in a `Box` and identify it by the pointer.
-#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
-struct CacheId(u64);
-
-impl CacheId {
-    fn next() -> Self {
-        static NEXT_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
-        let id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-        if id == 0 {
-            panic!("CacheId::new() returned 0, overflow");
-        }
-        Self(id)
-    }
-}
-
-/// See module-level comment.
-pub(crate) struct Cache<T: Types> {
-    id: CacheId,
-    map: Map<T>,
-}
-
-type Map<T> = HashMap<ShardTimelineId, Weak<HandleInner<T>>>;
-
-impl<T: Types> Default for Cache<T> {
-    fn default() -> Self {
-        Self {
-            id: CacheId::next(),
-            map: Default::default(),
-        }
-    }
-}
-
-#[derive(PartialEq, Eq, Debug, Hash, Clone, Copy)]
-pub(crate) struct ShardTimelineId {
-    pub(crate) shard_index: ShardIndex,
-    pub(crate) timeline_id: TimelineId,
-}
-
-/// See module-level comment.
-pub(crate) struct Handle<T: Types>(Arc<HandleInner<T>>);
-struct HandleInner<T: Types> {
-    shut_down: AtomicBool,
-    timeline: T::Timeline,
-    // The timeline's gate held open.
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
-///
-/// See module-level comment for details.
-pub struct PerTimelineState<T: Types> {
-    // None = shutting down
-    handles: Mutex<Option<HashMap<CacheId, Arc<HandleInner<T>>>>>,
-}
-
-impl<T: Types> Default for PerTimelineState<T> {
-    fn default() -> Self {
-        Self {
-            handles: Mutex::new(Some(Default::default())),
-        }
-    }
-}
-
-/// Abstract view of [`crate::tenant::mgr`], for testability.
-pub(crate) trait TenantManager<T: Types> {
-    /// Invoked by [`Cache::get`] to resolve a [`ShardTimelineId`] to a [`Types::Timeline`].
-    /// Errors are returned as [`GetError::TenantManager`].
-    async fn resolve(
-        &self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> Result<T::Timeline, T::TenantManagerError>;
-}
-
-/// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
-    fn shard_timeline_id(&self) -> ShardTimelineId;
-    fn get_shard_identity(&self) -> &ShardIdentity;
-    fn per_timeline_state(&self) -> &PerTimelineState<T>;
-}
-
-/// Errors returned by [`Cache::get`].
-#[derive(Debug)]
-pub(crate) enum GetError<T: Types> {
-    TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
-    PerTimelineStateShutDown,
-}
-
-/// Internal type used in [`Cache::get`].
-enum RoutingResult<T: Types> {
-    FastPath(Handle<T>),
-    SlowPath(ShardTimelineId),
-    NeedConsultTenantManager,
-}
-
-impl<T: Types> Cache<T> {
-    /// See module-level comment for details.
-    ///
-    /// Does NOT check for the shutdown state of [`Types::Timeline`].
-    /// Instead, the methods of [`Types::Timeline`] that are invoked through
-    /// the [`Handle`] are responsible for checking these conditions
-    /// and if so, return an error that causes the page service to
-    /// close the connection.
-    #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        // terminates because each iteration removes an element from the map
-        loop {
-            let handle = self
-                .get_impl(timeline_id, shard_selector, tenant_manager)
-                .await?;
-            if handle.0.shut_down.load(Ordering::Relaxed) {
-                let removed = self
-                    .map
-                    .remove(&handle.0.timeline.shard_timeline_id())
-                    .expect("invariant of get_impl is that the returned handle is in the map");
-                assert!(
-                    Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
-                    "shard_timeline_id() incorrect?"
-                );
-            } else {
-                return Ok(handle);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    async fn get_impl(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        let miss: ShardSelector = {
-            let routing_state = self.shard_routing(timeline_id, shard_selector);
-            match routing_state {
-                RoutingResult::FastPath(handle) => return Ok(handle),
-                RoutingResult::SlowPath(key) => match self.map.get(&key) {
-                    Some(cached) => match cached.upgrade() {
-                        Some(upgraded) => return Ok(Handle(upgraded)),
-                        None => {
-                            trace!("handle cache stale");
-                            self.map.remove(&key).unwrap();
-                            ShardSelector::Known(key.shard_index)
-                        }
-                    },
-                    None => ShardSelector::Known(key.shard_index),
-                },
-                RoutingResult::NeedConsultTenantManager => shard_selector,
-            }
-        };
-        self.get_miss(timeline_id, miss, tenant_manager).await
-    }
-
-    #[inline(always)]
-    fn shard_routing(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-    ) -> RoutingResult<T> {
-        loop {
-            // terminates because when every iteration we remove an element from the map
-            let Some((first_key, first_handle)) = self.map.iter().next() else {
-                return RoutingResult::NeedConsultTenantManager;
-            };
-            let Some(first_handle) = first_handle.upgrade() else {
-                // TODO: dedup with get()
-                trace!("handle cache stale");
-                let first_key_owned = *first_key;
-                self.map.remove(&first_key_owned).unwrap();
-                continue;
-            };
-
-            let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
-            let make_shard_index = |shard_num: ShardNumber| ShardIndex {
-                shard_number: shard_num,
-                shard_count: first_handle_shard_identity.count,
-            };
-
-            let need_idx = match shard_selector {
-                ShardSelector::Page(key) => {
-                    make_shard_index(first_handle_shard_identity.get_shard_number(&key))
-                }
-                ShardSelector::Zero => make_shard_index(ShardNumber(0)),
-                ShardSelector::Known(shard_idx) => shard_idx,
-            };
-            let need_shard_timeline_id = ShardTimelineId {
-                shard_index: need_idx,
-                timeline_id,
-            };
-            let first_handle_shard_timeline_id = ShardTimelineId {
-                shard_index: first_handle_shard_identity.shard_index(),
-                timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
-            };
-
-            if need_shard_timeline_id == first_handle_shard_timeline_id {
-                return RoutingResult::FastPath(Handle(first_handle));
-            } else {
-                return RoutingResult::SlowPath(need_shard_timeline_id);
-            }
-        }
-    }
-
-    #[instrument(level = "trace", skip_all)]
-    #[inline(always)]
-    async fn get_miss(
-        &mut self,
-        timeline_id: TimelineId,
-        shard_selector: ShardSelector,
-        tenant_manager: &T::TenantManager,
-    ) -> Result<Handle<T>, GetError<T>> {
-        match tenant_manager.resolve(timeline_id, shard_selector).await {
-            Ok(timeline) => {
-                let key = timeline.shard_timeline_id();
-                match &shard_selector {
-                    ShardSelector::Zero => assert_eq!(key.shard_index.shard_number, ShardNumber(0)),
-                    ShardSelector::Page(_) => (), // gotta trust tenant_manager
-                    ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
-                }
-
-                let gate_guard = match timeline.gate().enter() {
-                    Ok(guard) => guard,
-                    Err(_) => {
-                        return Err(GetError::TimelineGateClosed);
-                    }
-                };
-                trace!("creating new HandleInner");
-                let handle = Arc::new(
-                    // TODO: global metric that keeps track of the number of live HandlerTimeline instances
-                    // so we can identify reference cycle bugs.
-                    HandleInner {
-                        shut_down: AtomicBool::new(false),
-                        _gate_guard: gate_guard,
-                        timeline: timeline.clone(),
-                    },
-                );
-                let handle = {
-                    let mut lock_guard = timeline
-                        .per_timeline_state()
-                        .handles
-                        .lock()
-                        .expect("mutex poisoned");
-                    match &mut *lock_guard {
-                        Some(per_timeline_state) => {
-                            let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle));
-                            assert!(replaced.is_none(), "some earlier code left a stale handle");
-                            match self.map.entry(key) {
-                                hash_map::Entry::Occupied(_o) => {
-                                    // This cannot not happen because
-                                    // 1. we're the _miss_ handle, i.e., `self.map` didn't contain an entry and
-                                    // 2. we were holding &mut self during .resolve().await above, so, no other thread can have inserted a handle
-                                    //    while we were waiting for the tenant manager.
-                                    unreachable!()
-                                }
-                                hash_map::Entry::Vacant(v) => {
-                                    v.insert(Arc::downgrade(&handle));
-                                    handle
-                                }
-                            }
-                        }
-                        None => {
-                            return Err(GetError::PerTimelineStateShutDown);
-                        }
-                    }
-                };
-                Ok(Handle(handle))
-            }
-            Err(e) => Err(GetError::TenantManager(e)),
-        }
-    }
-}
-
-impl<T: Types> PerTimelineState<T> {
-    /// After this method returns, [`Cache::get`] will never again return a [`Handle`]
-    /// to the [`Types::Timeline`] that embeds this per-timeline state.
-    /// Even if [`TenantManager::resolve`] would still resolve to it.
-    ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
-    /// That's ok because they're short-lived. See module-level comment for details.
-    #[instrument(level = "trace", skip_all)]
-    pub(super) fn shutdown(&self) {
-        let handles = self
-            .handles
-            .lock()
-            .expect("mutex poisoned")
-            // NB: this .take() sets locked to None.
-            // That's what makes future `Cache::get` misses fail.
-            // Cache hits are taken care of below.
-            .take();
-        let Some(handles) = handles else {
-            trace!("already shut down");
-            return;
-        };
-        for handle in handles.values() {
-            // Make hits fail.
-            handle.shut_down.store(true, Ordering::Relaxed);
-        }
-        drop(handles);
-    }
-}
-
-impl<T: Types> std::ops::Deref for Handle<T> {
-    type Target = T::Timeline;
-    fn deref(&self) -> &Self::Target {
-        &self.0.timeline
-    }
-}
-
-#[cfg(test)]
-impl<T: Types> Drop for HandleInner<T> {
-    fn drop(&mut self) {
-        trace!("HandleInner dropped");
-    }
-}
-
-// When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
-impl<T: Types> Drop for Cache<T> {
-    fn drop(&mut self) {
-        for (_, weak) in self.map.drain() {
-            if let Some(strong) = weak.upgrade() {
-                // handle is still being kept alive in PerTimelineState
-                let timeline = strong.timeline.per_timeline_state();
-                let mut handles = timeline.handles.lock().expect("mutex poisoned");
-                if let Some(handles) = &mut *handles {
-                    let Some(removed) = handles.remove(&self.id) else {
-                        // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
-                        continue;
-                    };
-                    assert!(Arc::ptr_eq(&removed, &strong));
-                }
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use pageserver_api::{
-        key::{rel_block_to_key, Key, DBDIR_KEY},
-        models::ShardParameters,
-        reltag::RelTag,
-        shard::ShardStripeSize,
-    };
-    use utils::shard::ShardCount;
-
-    use super::*;
-
-    const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX);
-
-    #[derive(Debug)]
-    struct TestTypes;
-    impl Types for TestTypes {
-        type TenantManagerError = anyhow::Error;
-        type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
-    }
-
-    struct StubManager {
-        shards: Vec<Arc<StubTimeline>>,
-    }
-
-    struct StubTimeline {
-        gate: utils::sync::gate::Gate,
-        id: TimelineId,
-        shard: ShardIdentity,
-        per_timeline_state: PerTimelineState<TestTypes>,
-        myself: Weak<StubTimeline>,
-    }
-
-    impl StubTimeline {
-        fn getpage(&self) {
-            // do nothing
-        }
-    }
-
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
-        fn shard_timeline_id(&self) -> ShardTimelineId {
-            ShardTimelineId {
-                shard_index: self.shard.shard_index(),
-                timeline_id: self.id,
-            }
-        }
-
-        fn get_shard_identity(&self) -> &ShardIdentity {
-            &self.shard
-        }
-
-        fn per_timeline_state(&self) -> &PerTimelineState<TestTypes> {
-            &self.per_timeline_state
-        }
-    }
-
-    impl TenantManager<TestTypes> for StubManager {
-        async fn resolve(
-            &self,
-            timeline_id: TimelineId,
-            shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
-            for timeline in &self.shards {
-                if timeline.id == timeline_id {
-                    match &shard_selector {
-                        ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Zero => continue,
-                        ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Page(_) => continue,
-                        ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
-                        }
-                        ShardSelector::Known(_) => continue,
-                    }
-                }
-            }
-            anyhow::bail!("not found")
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_timeline_shutdown() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        //
-        // fill the cache
-        //
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        let handle: Handle<_> = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        let handle_inner_weak = Arc::downgrade(&handle.0);
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-        assert_eq!(
-            (
-                Weak::strong_count(&handle_inner_weak),
-                Weak::weak_count(&handle_inner_weak)
-            ),
-            (2, 2),
-            "strong: handle, per_timeline_state, weak: handle_inner_weak, cache"
-        );
-        assert_eq!(cache.map.len(), 1);
-
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-        drop(handle);
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself"
-        );
-
-        //
-        // demonstrate that Handle holds up gate closure
-        // but shutdown prevents new handles from being handed out
-        //
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("cache and per-timeline handler state keep cache open");
-            }
-            _ = tokio::time::sleep(FOREVER) => {
-                // NB: first poll of close() makes it enter closing state
-            }
-        }
-
-        let handle = cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have the timeline");
-        assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-
-        // SHUTDOWN
-        shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown
-
-        assert_eq!(
-            1,
-            Weak::strong_count(&handle_inner_weak),
-            "through local var handle"
-        );
-        assert_eq!(
-            cache.map.len(),
-            1,
-            "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (3, 1),
-            "strong: handleinner(via handle), shard0, mgr; weak: myself"
-        );
-
-        // this handle is perfectly usable
-        handle.getpage();
-
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown, even if there is an alive Handle");
-        assert_eq!(
-            cache.map.len(),
-            0,
-            "first access after shutdown cleans up the Weak's from the cache"
-        );
-
-        tokio::select! {
-            _ = shard0.gate.close() => {
-                panic!("handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-
-        drop(handle);
-        assert_eq!(
-            0,
-            Weak::strong_count(&handle_inner_weak),
-            "the HandleInner destructor already ran"
-        );
-        assert_eq!(
-            (Arc::strong_count(&shard0), Arc::weak_count(&shard0)),
-            (2, 1),
-            "strong: shard0, mgr; weak: myself"
-        );
-
-        // closing gate succeeds after dropping handle
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-
-        // map gets cleaned on next lookup
-        cache
-            .get(timeline_id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 0);
-
-        // ensure all refs to shard0 are gone and we're not leaking anything
-        let myself = Weak::clone(&shard0.myself);
-        drop(shard0);
-        drop(mgr);
-        assert_eq!(Weak::strong_count(&myself), 0);
-    }
-
-    #[tokio::test]
-    async fn test_multiple_timelines_and_deletion() {
-        crate::tenant::harness::setup_logging();
-
-        let timeline_a = TimelineId::generate();
-        let timeline_b = TimelineId::generate();
-        assert_ne!(timeline_a, timeline_b);
-        let timeline_a = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_a,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let timeline_b = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_b,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mut mgr = StubManager {
-            shards: vec![timeline_a.clone(), timeline_b.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we have it");
-        assert_eq!(cache.map.len(), 2);
-
-        // delete timeline A
-        timeline_a.per_timeline_state.shutdown();
-        mgr.shards.retain(|t| t.id != timeline_a.id);
-        assert!(
-            mgr.resolve(timeline_a.id, ShardSelector::Page(key))
-                .await
-                .is_err(),
-            "broken StubManager implementation"
-        );
-
-        assert_eq!(
-            cache.map.len(),
-            2,
-            "cache still has a Weak handle to Timeline A"
-        );
-        cache
-            .get(timeline_a.id, ShardSelector::Page(key), &mgr)
-            .await
-            .err()
-            .expect("documented behavior: can't get new handle after shutdown");
-        assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
-
-        cache
-            .get(timeline_b.id, ShardSelector::Page(key), &mgr)
-            .await
-            .expect("we still have it");
-    }
-
-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
-        rel_block_to_key(
-            RelTag {
-                spcnode: 1663,
-                dbnode: 208101,
-                relnode: 2620,
-                forknum: 0,
-            },
-            shard.0 as u32 * params.stripe_size.0,
-        )
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_shard_split() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let parent = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_params = ShardParameters {
-            count: ShardCount(2),
-            stripe_size: ShardStripeSize::default(),
-        };
-        let child0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child1 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let child_shards_by_shard_number = [child0.clone(), child1.clone()];
-
-        let mut cache = Cache::<TestTypes>::default();
-
-        // fill the cache with the parent
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![parent.clone()],
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent first"
-            );
-            drop(handle);
-        }
-
-        //
-        // SHARD SPLIT: tenant manager changes, but the cache isn't informed
-        //
-
-        // while we haven't shut down the parent, the cache will return the cached parent, even
-        // if the tenant manager returns the child
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(&handle.myself, &parent.myself),
-                "mgr returns parent"
-            );
-            drop(handle);
-        }
-
-        let parent_handle = cache
-            .get(
-                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
-                &StubManager {
-                    shards: vec![parent.clone()],
-                },
-            )
-            .await
-            .expect("we have it");
-        assert!(Weak::ptr_eq(&parent_handle.myself, &parent.myself));
-
-        // invalidate the cache
-        parent.per_timeline_state.shutdown();
-
-        // the cache will now return the child, even though the parent handle still exists
-        for i in 0..2 {
-            let handle = cache
-                .get(
-                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
-                    &StubManager {
-                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
-                    },
-                )
-                .await
-                .expect("we have it");
-            assert!(
-                Weak::ptr_eq(
-                    &handle.myself,
-                    &child_shards_by_shard_number[i as usize].myself
-                ),
-                "mgr returns child"
-            );
-            drop(handle);
-        }
-
-        // all the while the parent handle kept the parent gate open
-        tokio::select! {
-            _ = parent_handle.gate.close() => {
-                panic!("parent handle is keeping gate open");
-            }
-            _ = tokio::time::sleep(FOREVER) => { }
-        }
-        drop(parent_handle);
-        tokio::select! {
-            _ = parent.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("parent handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn test_connection_handler_exit() {
-        crate::tenant::harness::setup_logging();
-        let timeline_id = TimelineId::generate();
-        let shard0 = Arc::new_cyclic(|myself| StubTimeline {
-            gate: Default::default(),
-            id: timeline_id,
-            shard: ShardIdentity::unsharded(),
-            per_timeline_state: PerTimelineState::default(),
-            myself: myself.clone(),
-        });
-        let mgr = StubManager {
-            shards: vec![shard0.clone()],
-        };
-        let key = DBDIR_KEY;
-
-        // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
-        for _ in 0..10 {
-            let mut cache = Cache::<TestTypes>::default();
-            let handle = {
-                let handle = cache
-                    .get(timeline_id, ShardSelector::Page(key), &mgr)
-                    .await
-                    .expect("we have the timeline");
-                assert!(Weak::ptr_eq(&handle.myself, &shard0.myself));
-                handle
-            };
-            handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.0));
-        }
-
-        // No handles exist, thus gates are closed and don't require shutdown
-        assert!(used_handles
-            .iter()
-            .all(|weak| Weak::strong_count(weak) == 0));
-
-        // ... thus the gate should close immediately, even without shutdown
-        tokio::select! {
-            _ = shard0.gate.close() => { }
-            _ = tokio::time::sleep(FOREVER) => {
-                panic!("handle is dropped, no other gate holders exist")
-            }
-        }
-    }
-}
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -241,9 +241,6 @@ impl PostgresRedoManager {

    /// Shut down the WAL redo manager.
    ///
-    /// Returns `true` if this call was the one that initiated shutdown.
-    /// `true` may be observed by no caller if the first caller stops polling.
-    ///
    /// After this future completes
    /// - no redo process is running
    /// - no new redo process will be spawned
@@ -253,32 +250,22 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn shutdown(&self) -> bool {
+    pub async fn shutdown(&self) {
        // prevent new processes from being spawned
-        let maybe_permit = match self.redo_process.get_or_init_detached().await {
+        let permit = match self.redo_process.get_or_init_detached().await {
            Ok(guard) => {
-                if matches!(&*guard, ProcessOnceCell::ManagerShutDown) {
-                    None
-                } else {
-                    let (proc, permit) = guard.take_and_deinit();
-                    drop(proc); // this just drops the Arc, its refcount may not be zero yet
-                    Some(permit)
-                }
+                let (proc, permit) = guard.take_and_deinit();
+                drop(proc); // this just drops the Arc, its refcount may not be zero yet
+                permit
            }
-            Err(permit) => Some(permit),
-        };
-        let it_was_us = if let Some(permit) = maybe_permit {
-            self.redo_process
-                .set(ProcessOnceCell::ManagerShutDown, permit);
-            true
-        } else {
-            false
+            Err(permit) => permit,
        };
+        self.redo_process
+            .set(ProcessOnceCell::ManagerShutDown, permit);
        // wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
        // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
        // for the underlying process.
        self.launched_processes.close().await;
-        it_was_us
    }

    /// This type doesn't have its own background task to check for idleness: we
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -67,7 +67,6 @@ FALLBACK_DURATION = {
    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback_with_snapshots": 575.735,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -53,6 +53,7 @@ diesel = { version = "2.1.4", features = [
 ] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
+dns-lookup = { version = "2.0.4" }

 utils = { path = "../libs/utils/" }
 metrics = { path = "../libs/metrics/" }
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/down.sql
@@ -0,0 +1 @@
+DROP TABLE leader;
--- a/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
+++ b/storage_controller/migrations/2024-07-26-140924_create_leader/up.sql
@@ -0,0 +1,6 @@
+CREATE TABLE leader (
+  hostname VARCHAR NOT NULL,
+  port INTEGER NOT NULL,
+  started_at TIMESTAMPTZ NOT NULL,
+  PRIMARY KEY(hostname, port, started_at)
+);
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -10,6 +10,7 @@ mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
+mod peer_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -81,6 +81,9 @@ struct Cli {
    #[arg(long, default_value = "5s")]
    db_connect_timeout: humantime::Duration,

+    #[arg(long, default_value = "false")]
+    start_as_candidate: bool,
+
    /// `neon_local` sets this to the path of the neon_local repo dir.
    /// Only relevant for testing.
    // TODO: make `cfg(feature = "testing")`
@@ -273,6 +276,8 @@ async fn async_main() -> anyhow::Result<()> {
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
+        start_as_candidate: args.start_as_candidate,
+        http_service_port: args.listen.port() as i32,
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
--- a/storage_controller/src/peer_client.rs
+++ b/storage_controller/src/peer_client.rs
@@ -0,0 +1,104 @@
+use crate::tenant_shard::ObservedState;
+use pageserver_api::shard::TenantShardId;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use tokio_util::sync::CancellationToken;
+
+use reqwest::{StatusCode, Url};
+use utils::{backoff, http::error::HttpErrorBody};
+
+#[derive(Debug, Clone)]
+pub(crate) struct PeerClient {
+    hostname: String,
+    port: i32,
+    jwt: Option<String>,
+    client: reqwest::Client,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum StorageControllerPeerError {
+    #[error("failed to deserialize error response with status code {0} at {1}: {2}")]
+    DeserializationError(StatusCode, Url, reqwest::Error),
+    #[error("storage controller peer API error ({0}): {1}")]
+    ApiError(StatusCode, String),
+    #[error("failed to send HTTP request: {0}")]
+    SendError(reqwest::Error),
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
+
+pub(crate) trait ResponseErrorMessageExt: Sized {
+    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
+}
+
+impl ResponseErrorMessageExt for reqwest::Response {
+    async fn error_from_body(self) -> Result<Self> {
+        let status = self.status();
+        if !(status.is_client_error() || status.is_server_error()) {
+            return Ok(self);
+        }
+
+        let url = self.url().to_owned();
+        Err(match self.json::<HttpErrorBody>().await {
+            Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
+            Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
+        })
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Default)]
+pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
+
+impl PeerClient {
+    /// Create a client for the peer at `hostname:port`, authenticating with `jwt` if set.
+    pub(crate) fn new(hostname: String, port: i32, jwt: Option<String>) -> Self {
+        Self {
+            hostname,
+            port,
+            jwt,
+            client: reqwest::Client::new(),
+        }
+    }
+
+    /// Issue one `PUT /control/v1/step_down` and decode the observed state from the body.
+    /// `hostname` is expected to be a bare host or IP; `http://` makes the URL absolute.
+    async fn request_step_down(&self) -> Result<GlobalObservedState> {
+        let uri = format!("http://{}:{}/control/v1/step_down", self.hostname, self.port);
+        let mut req = self.client.put(uri);
+        if let Some(jwt) = &self.jwt {
+            req = req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"));
+        }
+
+        let res = req
+            .send()
+            .await
+            .map_err(StorageControllerPeerError::SendError)?;
+        let response = res.error_from_body().await?;
+
+        let status = response.status();
+        let url = response.url().to_owned();
+        response
+            .json()
+            .await
+            .map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
+    }
+
+    /// Request step-down with retries; maps a `None` from [`backoff::retry`] to `Cancelled`.
+    pub(crate) async fn step_down(
+        &self,
+        cancel: &CancellationToken,
+    ) -> Result<GlobalObservedState> {
+        backoff::retry(
+            || self.request_step_down(),
+            |_e| false,
+            4,
+            8,
+            "Send step down request",
+            cancel,
+        )
+        .await
+        .unwrap_or(Err(StorageControllerPeerError::Cancelled))
+    }
+}
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -95,6 +95,8 @@ pub(crate) enum DatabaseOperation {
    ListMetadataHealth,
    ListMetadataHealthUnhealthy,
    ListMetadataHealthOutdated,
+    GetLeader,
+    UpdateLeader,
 }

 #[must_use]
@@ -785,6 +787,71 @@ impl Persistence {
        )
        .await
    }
+
+    /// Fetch the single row of the `leader` table, or `None` if the table is empty.
+    /// Finding more than one row is reported as a [`DatabaseError::Logical`] error.
+    pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<LeaderPersistence>> {
+        let mut leader: Vec<LeaderPersistence> = self
+            .with_measured_conn(
+                DatabaseOperation::GetLeader,
+                move |conn| -> DatabaseResult<_> {
+                    Ok(crate::schema::leader::table.load::<LeaderPersistence>(conn)?)
+                },
+            )
+            .await?;
+
+        // `pop` on an empty vector gives `None`, so zero and one row share an arm.
+        match leader.len() {
+            0 | 1 => Ok(leader.pop()),
+            _ => Err(DatabaseError::Logical(format!(
+                "More than one entry present in the leader table: {leader:?}"
+            ))),
+        }
+    }
+
+    /// Update the leader entry. With `prev` set this is a compare-exchange: it fails if
+    /// `prev` does not match the current entry. Without `prev` the update is forced: all
+    /// existing rows are removed before `new` is inserted, keeping the table at one row.
+    pub(crate) async fn update_leader(
+        &self,
+        prev: Option<LeaderPersistence>,
+        new: LeaderPersistence,
+    ) -> DatabaseResult<()> {
+        use crate::schema::leader::dsl::*;
+
+        let updated = self
+            .with_measured_conn(
+                DatabaseOperation::UpdateLeader,
+                move |conn| -> DatabaseResult<usize> {
+                    Ok(match &prev {
+                        Some(prev) => diesel::update(leader)
+                            .filter(hostname.eq(prev.hostname.clone()))
+                            .filter(port.eq(prev.port))
+                            .filter(started_at.eq(prev.started_at))
+                            .set((
+                                hostname.eq(new.hostname.clone()),
+                                port.eq(new.port),
+                                started_at.eq(new.started_at),
+                            ))
+                            .execute(conn)?,
+                        None => {
+                            // NOTE(review): assumes this closure runs in a transaction - confirm.
+                            diesel::delete(leader).execute(conn)?;
+                            diesel::insert_into(leader)
+                                .values(new.clone())
+                                .execute(conn)?
+                        }
+                    })
+                },
+            )
+            .await?;
+
+        if updated == 0 {
+            return Err(DatabaseError::Logical("Leader table update failed".to_string()));
+        }
+
+        Ok(())
+    }
 }

 /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -910,3 +977,13 @@ impl From<MetadataHealthPersistence> for MetadataHealthRecord {
        }
    }
 }
+
+#[derive(
+    Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
+)]
+#[diesel(table_name = crate::schema::leader)]
+pub(crate) struct LeaderPersistence {
+    pub(crate) hostname: String,
+    pub(crate) port: i32,
+    pub(crate) started_at: chrono::DateTime<chrono::Utc>,
+}
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -656,8 +656,11 @@ impl Reconciler {
                    // reconcile this location.  This includes locations with different configurations, as well
                    // as locations with unknown (None) observed state.

-                    // Incrementing generation is the safe general case, but is inefficient for changes that only
-                    // modify some details (e.g. the tenant's config).
+                    // The general case is to increment the generation.  However, there are cases
+                    // where this is not necessary:
+                    // - if we are only updating the TenantConf part of the location
+                    // - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
+                    //   and the location was already in the correct generation
                    let increment_generation = match observed {
                        None => true,
                        Some(ObservedStateLocation { conf: None }) => true,
@@ -666,11 +669,18 @@ impl Reconciler {
                        }) => {
                            let generations_match = observed.generation == wanted_conf.generation;

-                            // We may skip incrementing the generation if the location is already in the expected mode and
-                            // generation.  In principle it would also be safe to skip from certain other modes (e.g. AttachedStale),
-                            // but such states are handled inside `live_migrate`, and if we see that state here we're cleaning up
-                            // after a restart/crash, so fall back to the universally safe path of incrementing generation.
-                            !generations_match || (observed.mode != wanted_conf.mode)
+                            use LocationConfigMode::*;
+                            let mode_transition_requires_gen_inc =
+                                match (observed.mode, wanted_conf.mode) {
+                                    // Usually the short-lived attachment modes (multi and stale) are only used
+                                    // in the case of [`Self::live_migrate`], but it is simple to handle them correctly
+                                    // here too.  Locations are allowed to go Single->Stale and Multi->Single within the same generation.
+                                    (AttachedSingle, AttachedStale) => false,
+                                    (AttachedMulti, AttachedSingle) => false,
+                                    (lhs, rhs) => lhs != rhs,
+                                };
+
+                            !generations_match || mode_transition_requires_gen_inc
                        }
                    };

--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -1,5 +1,13 @@
 // @generated automatically by Diesel CLI.

+diesel::table! {
+    leader (hostname, port, started_at) {
+        hostname -> Varchar,
+        port -> Int4,
+        started_at -> Timestamptz,
+    }
+}
+
 diesel::table! {
    metadata_health (tenant_id, shard_number, shard_count) {
        tenant_id -> Varchar,
@@ -36,4 +44,4 @@ diesel::table! {
    }
 }

-diesel::allow_tables_to_appear_in_same_query!(metadata_health, nodes, tenant_shards,);
+diesel::allow_tables_to_appear_in_same_query!(leader, metadata_health, nodes, tenant_shards,);
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -16,7 +16,10 @@ use crate::{
    compute_hook::NotifyError,
    id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
    metrics::LeadershipStatusGroup,
-    persistence::{AbortShardSplitStatus, MetadataHealthPersistence, TenantFilter},
+    peer_client::{GlobalObservedState, PeerClient},
+    persistence::{
+        AbortShardSplitStatus, LeaderPersistence, MetadataHealthPersistence, TenantFilter,
+    },
    reconciler::{ReconcileError, ReconcileUnits},
    scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
    tenant_shard::{
@@ -82,7 +85,6 @@ use crate::{
        ReconcilerWaiter, TenantShard,
    },
 };
-use serde::{Deserialize, Serialize};

 // For operations that should be quick, like attaching a new tenant
 const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -223,6 +225,7 @@ impl ServiceState {
        tenants: BTreeMap<TenantShardId, TenantShard>,
        scheduler: Scheduler,
        delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
+        initial_leadership_status: LeadershipStatus,
    ) -> Self {
        let status = &crate::metrics::METRICS_REGISTRY
            .metrics_group
@@ -230,15 +233,13 @@ impl ServiceState {

        status.set(
            LeadershipStatusGroup {
-                status: LeadershipStatus::Leader,
+                status: initial_leadership_status,
            },
            1,
        );

        Self {
-            // TODO: Starting up as Leader is a transient state. Once we enable rolling
-            // upgrades on the k8s side, we should start up as Candidate.
-            leadership_status: LeadershipStatus::Leader,
+            leadership_status: initial_leadership_status,
            tenants,
            nodes: Arc::new(nodes),
            scheduler,
@@ -287,6 +288,33 @@ impl ServiceState {
            0,
        );
    }
+
+    /// Record this instance as the leader.
+    ///
+    /// The in-memory `leadership_status` is switched to
+    /// [`LeadershipStatus::Leader`], and the
+    /// `storage_controller_leadership_status` metric is updated to match:
+    /// the `Leader` label is set to 1,
+    /// while the `SteppedDown` and `Candidate` labels
+    /// are set to 0.
+    fn become_leader(&mut self) {
+        self.leadership_status = LeadershipStatus::Leader;
+
+        let gauge = &crate::metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_leadership_status;
+
+        // (status, value) pairs; `Leader` is raised before the others are cleared.
+        let updates = [
+            (LeadershipStatus::Leader, 1),
+            (LeadershipStatus::SteppedDown, 0),
+            (LeadershipStatus::Candidate, 0),
+        ];
+
+        for (status, value) in updates {
+            gauge.set(LeadershipStatusGroup { status }, value);
+        }
+    }
 }

 #[derive(Clone)]
@@ -323,6 +351,10 @@ pub struct Config {

    // TODO: make this cfg(feature  = "testing")
    pub neon_local_repo_dir: Option<PathBuf>,
+
+    pub start_as_candidate: bool,
+
+    pub http_service_port: i32,
 }

 impl From<DatabaseError> for ApiError {
@@ -490,9 +522,10 @@ pub(crate) enum ReconcileResultRequest {
    Stop,
 }

-// TODO: move this into the storcon peer client when that gets added
-#[derive(Serialize, Deserialize, Debug, Default)]
-pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
+struct LeaderStepDownState {
+    observed: GlobalObservedState,
+    leader: LeaderPersistence,
+}

 impl Service {
    pub fn get_config(&self) -> &Config {
@@ -504,15 +537,11 @@ impl Service {
    #[instrument(skip_all)]
    async fn startup_reconcile(
        self: &Arc<Service>,
+        leader_step_down_state: Option<LeaderStepDownState>,
        bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
            Result<(), (TenantShardId, NotifyError)>,
        >,
    ) {
-        // For all tenant shards, a vector of observed states on nodes (where None means
-        // indeterminate, same as in [`ObservedStateLocation`])
-        let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
-            HashMap::new();
-
        // Startup reconciliation does I/O to other services: whether they
        // are responsive or not, we should aim to finish within our deadline, because:
        // - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -526,26 +555,29 @@ impl Service {
            .checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
            .expect("Reconcile timeout is a modest constant");

+        let (observed, current_leader) = if let Some(state) = leader_step_down_state {
+            tracing::info!(
+                "Using observed received from leader at {}:{}",
+                state.leader.hostname,
+                state.leader.port
+            );
+            (state.observed, Some(state.leader))
+        } else {
+            (
+                self.build_global_observed_state(node_scan_deadline).await,
+                None,
+            )
+        };
+
        // Accumulate a list of any tenant locations that ought to be detached
        let mut cleanup = Vec::new();

-        let node_listings = self.scan_node_locations(node_scan_deadline).await;
-        // Send initial heartbeat requests to nodes that replied to the location listing above.
-        let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
-
-        for (node_id, list_response) in node_listings {
-            let tenant_shards = list_response.tenant_shards;
-            tracing::info!(
-                "Received {} shard statuses from pageserver {}, setting it to Active",
-                tenant_shards.len(),
-                node_id
-            );
-
-            for (tenant_shard_id, conf_opt) in tenant_shards {
-                let shard_observations = observed.entry(tenant_shard_id).or_default();
-                shard_observations.push((node_id, conf_opt));
-            }
-        }
+        // Send initial heartbeat requests to all nodes loaded from the database
+        let all_nodes = {
+            let locked = self.inner.read().unwrap();
+            locked.nodes.clone()
+        };
+        let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;

        // List of tenants for which we will attempt to notify compute of their location at startup
        let mut compute_notifications = Vec::new();
@@ -568,17 +600,16 @@ impl Service {
            }
            *nodes = Arc::new(new_nodes);

-            for (tenant_shard_id, shard_observations) in observed {
-                for (node_id, observed_loc) in shard_observations {
-                    let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
-                        cleanup.push((tenant_shard_id, node_id));
-                        continue;
-                    };
-                    tenant_shard
-                        .observed
-                        .locations
-                        .insert(node_id, ObservedStateLocation { conf: observed_loc });
-                }
+            for (tenant_shard_id, observed_state) in observed.0 {
+                let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
+                    for node_id in observed_state.locations.keys() {
+                        cleanup.push((tenant_shard_id, *node_id));
+                    }
+
+                    continue;
+                };
+
+                tenant_shard.observed = observed_state;
            }

            // Populate each tenant's intent state
@@ -612,6 +643,22 @@ impl Service {
            tenants.len()
        };

+        // Before making any observable changes to the cluster, persist self
+        // as leader in database and memory.
+
+        let proposed_leader = self.get_proposed_leader_info();
+
+        if let Err(err) = self
+            .persistence
+            .update_leader(current_leader, proposed_leader)
+            .await
+        {
+            tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
+            std::process::exit(1);
+        }
+
+        self.inner.write().unwrap().become_leader();
+
        // TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
        // generation_pageserver in the database.

@@ -777,6 +824,31 @@ impl Service {
        node_results
    }

+    async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
+        let node_listings = self.scan_node_locations(deadline).await;
+        let mut observed = GlobalObservedState::default();
+
+        for (node_id, location_confs) in node_listings {
+            tracing::info!(
+                "Received {} shard statuses from pageserver {}",
+                location_confs.tenant_shards.len(),
+                node_id
+            );
+
+            for (tid, location_conf) in location_confs.tenant_shards {
+                let entry = observed.0.entry(tid).or_default();
+                entry.locations.insert(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: location_conf,
+                    },
+                );
+            }
+        }
+
+        observed
+    }
+
    /// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
    ///
    /// This is safe to run in the background, because if we don't have this TenantShardId in our map of
@@ -1255,12 +1327,20 @@ impl Service {
            config.max_warming_up_interval,
            cancel.clone(),
        );
+
+        let initial_leadership_status = if config.start_as_candidate {
+            LeadershipStatus::Candidate
+        } else {
+            LeadershipStatus::Leader
+        };
+
        let this = Arc::new(Self {
            inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
                nodes,
                tenants,
                scheduler,
                delayed_reconcile_rx,
+                initial_leadership_status,
            ))),
            config: config.clone(),
            persistence,
@@ -1329,7 +1409,16 @@ impl Service {
                    return;
                };

-                this.startup_reconcile(bg_compute_notify_result_tx).await;
+                let leadership_status = this.inner.read().unwrap().get_leadership_status();
+                let peer_observed_state = match leadership_status {
+                    LeadershipStatus::Candidate => this.request_step_down().await,
+                    LeadershipStatus::Leader => None,
+                    LeadershipStatus::SteppedDown => unreachable!(),
+                };
+
+                this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
+                    .await;
+
                drop(startup_completion);
            }
        });
@@ -6179,4 +6268,88 @@ impl Service {

        global_observed
    }
+
+    /// Collect the details for the current process wishing to become the storage controller
+    /// leader.
+    ///
+    /// On failures to discover and resolve the hostname the process is killed and we rely on k8s to retry.
+    fn get_proposed_leader_info(&self) -> LeaderPersistence {
+        let hostname = match dns_lookup::get_hostname() {
+            Ok(name) => name,
+            Err(err) => {
+                tracing::error!("Failed to discover hostname: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
+        };
+
+        let mut addrs = match dns_lookup::lookup_host(&hostname) {
+            Ok(addrs) => addrs,
+            Err(err) => {
+                tracing::error!("Failed to resolve hostname: {err}. Aborting start-up ...");
+                std::process::exit(1);
+            }
+        };
+
+        let addr = addrs
+            .pop()
+            .expect("k8s configured hostname always resolves");
+
+        let proposed = LeaderPersistence {
+            hostname: addr.to_string(),
+            port: self.get_config().http_service_port,
+            started_at: chrono::Utc::now(),
+        };
+
+        tracing::info!("Proposed leader details are: {proposed:?}");
+
+        proposed
+    }
+
+    /// Request step down from the currently registered leader in the database.
+    ///
+    /// Returns:
+    /// - `Some(state)` holding the observed state handed over by the leader together
+    ///   with the leader's persisted details, when the step-down request succeeded;
+    /// - `None` when no leader entry is persisted;
+    /// - `None` when the step-down request failed. The error is only logged, so the
+    ///   caller cannot tell this case apart from "no leader".
+    ///
+    /// On failures to query the database the process is killed and we rely on k8s
+    /// to retry.
+    async fn request_step_down(&self) -> Option<LeaderStepDownState> {
+        let leader = match self.persistence.get_leader().await {
+            Ok(leader) => leader,
+            Err(err) => {
+                tracing::error!(
+                    "Failed to query database for current leader: {err}. Aborting start-up ..."
+                );
+                std::process::exit(1);
+            }
+        };
+
+        // Nothing to do when no leader is registered.
+        let leader = leader?;
+
+        let client = PeerClient::new(
+            leader.hostname.clone(),
+            leader.port,
+            self.config.jwt_token.clone(),
+        );
+
+        match client.step_down(&self.cancel).await {
+            // `leader` moves into the result; no clone needed.
+            Ok(observed) => Some(LeaderStepDownState { observed, leader }),
+            Err(err) => {
+                // Not fatal: start-up carries on and rebuilds the observed state by
+                // scanning the pageservers instead.
+                tracing::error!(
+                    "Leader ({}:{}) did not respond to step-down request: {}",
+                    leader.hostname,
+                    leader.port,
+                    err
+                );
+                None
+            }
+        }
+    }
 }
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -19,8 +19,8 @@ use utils::id::TenantId;

 use crate::{
    cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
-    init_remote_generic, list_objects_with_retries_generic,
-    metadata_stream::{stream_tenant_timelines_generic, stream_tenants_generic},
+    init_remote, init_remote_generic, list_objects_with_retries,
+    metadata_stream::{stream_tenant_timelines, stream_tenants},
    BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
 };

@@ -153,7 +153,7 @@ async fn find_garbage_inner(
    node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
    // Construct clients for S3 and for Console API
-    let (remote_client, target) = init_remote_generic(bucket_config.clone(), node_kind).await?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));

    // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -179,7 +179,7 @@ async fn find_garbage_inner(

    // Enumerate Tenants in S3, and check if each one exists in Console
    tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
-    let tenants = stream_tenants_generic(&remote_client, &target);
+    let tenants = stream_tenants(&s3_client, &target);
    let tenants_checked = tenants.map_ok(|t| {
        let api_client = cloud_admin_api_client.clone();
        let console_cache = console_cache.clone();
@@ -237,26 +237,25 @@ async fn find_garbage_inner(
        // Special case: If it's missing in console, check for known bugs that would enable us to conclusively
        // identify it as purge-able anyway
        if console_result.is_none() {
-            let timelines =
-                stream_tenant_timelines_generic(&remote_client, &target, tenant_shard_id)
-                    .await?
-                    .collect::<Vec<_>>()
-                    .await;
+            let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
+                .await?
+                .collect::<Vec<_>>()
+                .await;
            if timelines.is_empty() {
                // No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
-                let tenant_objects = list_objects_with_retries_generic(
-                    &remote_client,
-                    ListingMode::WithDelimiter,
+                let tenant_objects = list_objects_with_retries(
+                    &s3_client,
                    &target.tenant_root(&tenant_shard_id),
+                    None,
                )
                .await?;
-                let object = tenant_objects.keys.first().unwrap();
-                if object.key.get_path().as_str().ends_with("heatmap-v1.json") {
+                let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
+                if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
                    tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
                    garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
                    continue;
                } else {
-                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key);
+                    tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
                }
            } else {
                // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
@@ -265,18 +264,24 @@ async fn find_garbage_inner(

                for timeline_r in timelines {
                    let timeline = timeline_r?;
-                    let timeline_objects = list_objects_with_retries_generic(
-                        &remote_client,
-                        ListingMode::WithDelimiter,
+                    let timeline_objects = list_objects_with_retries(
+                        &s3_client,
                        &target.timeline_root(&timeline),
+                        None,
                    )
                    .await?;
-                    if !timeline_objects.prefixes.is_empty() {
+                    if timeline_objects
+                        .common_prefixes
+                        .as_ref()
+                        .map(|v| v.len())
+                        .unwrap_or(0)
+                        > 0
+                    {
                        // Sub-paths?  Unexpected
                        any_non_initdb = true;
                    } else {
-                        let object = timeline_objects.keys.first().unwrap();
-                        if object.key.get_path().as_str().ends_with("initdb.tar.zst") {
+                        let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
+                        if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
                            tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
                        } else {
                            any_non_initdb = true;
@@ -331,8 +336,7 @@ async fn find_garbage_inner(

    // Construct a stream of all timelines within active tenants
    let active_tenants = tokio_stream::iter(active_tenants.iter().map(Ok));
-    let timelines =
-        active_tenants.map_ok(|t| stream_tenant_timelines_generic(&remote_client, &target, *t));
+    let timelines = active_tenants.map_ok(|t| stream_tenant_timelines(&s3_client, &target, *t));
    let timelines = timelines.try_buffer_unordered(S3_CONCURRENCY);
    let timelines = timelines.try_flatten();

--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -427,7 +427,6 @@ async fn list_objects_with_retries(
    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

-/// Listing possibly large amounts of keys in a streaming fashion.
 fn stream_objects_with_retries<'a>(
    storage_client: &'a GenericRemoteStorage,
    listing_mode: ListingMode,
@@ -466,45 +465,6 @@ fn stream_objects_with_retries<'a>(
    }
 }

-/// If you want to list a bounded amount of prefixes or keys. For larger numbers of keys/prefixes,
-/// use [`stream_objects_with_retries`] instead.
-async fn list_objects_with_retries_generic(
-    remote_client: &GenericRemoteStorage,
-    listing_mode: ListingMode,
-    s3_target: &S3Target,
-) -> anyhow::Result<Listing> {
-    let cancel = CancellationToken::new();
-    let prefix_str = &s3_target
-        .prefix_in_bucket
-        .strip_prefix("/")
-        .unwrap_or(&s3_target.prefix_in_bucket);
-    let prefix = RemotePath::from_string(prefix_str)?;
-    for trial in 0..MAX_RETRIES {
-        match remote_client
-            .list(Some(&prefix), listing_mode, None, &cancel)
-            .await
-        {
-            Ok(response) => return Ok(response),
-            Err(e) => {
-                if trial == MAX_RETRIES - 1 {
-                    return Err(e)
-                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
-                }
-                error!(
-                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
-                    s3_target.bucket_name,
-                    s3_target.prefix_in_bucket,
-                    s3_target.delimiter,
-                    DisplayErrorContext(e),
-                );
-                let backoff_time = 1 << trial.max(5);
-                tokio::time::sleep(Duration::from_secs(backoff_time)).await;
-            }
-        }
-    }
-    panic!("MAX_RETRIES is not allowed to be 0");
-}
-
 async fn download_object_with_retries(
    s3_client: &Client,
    bucket_name: &str,
--- a/storage_scrubber/src/metadata_stream.rs
+++ b/storage_scrubber/src/metadata_stream.rs
@@ -189,63 +189,6 @@ pub async fn stream_tenant_timelines<'a>(
    })
 }

-/// Given a `TenantShardId`, output a stream of the timelines within that tenant, discovered
-/// using a listing. The listing is done before the stream is built, so that this
-/// function can be used to generate concurrency on a stream using buffer_unordered.
-pub async fn stream_tenant_timelines_generic<'a>(
-    remote_client: &'a GenericRemoteStorage,
-    target: &'a RootTarget,
-    tenant: TenantShardId,
-) -> anyhow::Result<impl Stream<Item = Result<TenantShardTimelineId, anyhow::Error>> + 'a> {
-    let mut timeline_ids: Vec<Result<TimelineId, anyhow::Error>> = Vec::new();
-    let timelines_target = target.timelines_root(&tenant);
-
-    let mut objects_stream = std::pin::pin!(stream_objects_with_retries(
-        remote_client,
-        ListingMode::WithDelimiter,
-        &timelines_target
-    ));
-    loop {
-        tracing::debug!("Listing in {tenant}");
-        let fetch_response = match objects_stream.next().await {
-            None => break,
-            Some(Err(e)) => {
-                timeline_ids.push(Err(e));
-                break;
-            }
-            Some(Ok(r)) => r,
-        };
-
-        let new_entry_ids = fetch_response
-            .prefixes
-            .iter()
-            .filter_map(|prefix| -> Option<&str> {
-                prefix
-                    .get_path()
-                    .as_str()
-                    .strip_prefix(&timelines_target.prefix_in_bucket)?
-                    .strip_suffix('/')
-            })
-            .map(|entry_id_str| {
-                entry_id_str
-                    .parse::<TimelineId>()
-                    .with_context(|| format!("Incorrect entry id str: {entry_id_str}"))
-            });
-
-        for i in new_entry_ids {
-            timeline_ids.push(i);
-        }
-    }
-
-    tracing::debug!("Yielding for {}", tenant);
-    Ok(stream! {
-        for i in timeline_ids {
-            let id = i?;
-            yield Ok(TenantShardTimelineId::new(tenant, id));
-        }
-    })
-}
-
 pub(crate) fn stream_listing<'a>(
    s3_client: &'a Client,
    target: &'a S3Target,
--- a/test_runner/performance/test_gc_feedback.py
+++ b/test_runner/performance/test_gc_feedback.py
@@ -6,8 +6,21 @@ from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder


-def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, mode: str):
-    assert mode == "normal" or mode == "with_snapshots"
+@pytest.mark.timeout(10000)
+def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    """
+    Test that GC is able to collect all old layers even if they are forming
+    "stairs" and there are not three delta layers since the last image layer.
+
+    Information about image layers needed to collect old layers should
+    be propagated by GC to the compaction task, which should take it into account
+    when deciding which new image layers need to be created.
+
+    NB: this test demonstrates the problem. The source tree contained the
+    `gc_feedback` mechanism for about 9 months, but, there were problems
+    with it and it wasn't enabled at runtime.
+    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
+    """
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()

@@ -61,9 +74,6 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma

            physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
            log.info(f"Physical storage size {physical_size}")
-        if mode == "with_snapshots":
-            if step == n_steps / 2:
-                env.neon_cli.create_branch("child")

    max_num_of_deltas_above_image = 0
    max_total_num_of_deltas = 0
@@ -139,37 +149,3 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
    log.info(f"Writing layer map to {layer_map_path}")
    with layer_map_path.open("w") as f:
        f.write(json.dumps(client.timeline_layer_map_info(tenant_id, timeline_id)))
-
-
-@pytest.mark.timeout(10000)
-def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
-    """
-    Test that GC is able to collect all old layers even if them are forming
-    "stairs" and there are not three delta layers since last image layer.
-
-    Information about image layers needed to collect old layers should
-    be propagated by GC to compaction task which should take in in account
-    when make a decision which new image layers needs to be created.
-
-    NB: this test demonstrates the problem. The source tree contained the
-    `gc_feedback` mechanism for about 9 months, but, there were problems
-    with it and it wasn't enabled at runtime.
-    This PR removed the code: https://github.com/neondatabase/neon/pull/6863
-
-    And the bottom-most GC-compaction epic resolves the problem.
-    https://github.com/neondatabase/neon/issues/8002
-    """
-    gc_feedback_impl(neon_env_builder, zenbenchmark, "normal")
-
-
-@pytest.mark.timeout(10000)
-def test_gc_feedback_with_snapshots(
-    neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
-):
-    """
-    Compared with `test_gc_feedback`, we create a branch without written data (=snapshot) in the middle
-    of the benchmark, and the   bottom-most compaction should collect as much garbage as possible below the GC
-    horizon. Ideally, there should be images (in an image layer) covering the full range at the branch point,
-    and images covering the full key range (in a delta layer) at the GC horizon.
-    """
-    gc_feedback_impl(neon_env_builder, zenbenchmark, "with_snapshots")
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -277,12 +277,8 @@ files:
        help: 'Bytes between received and replayed LSN'
        key_labels:
        values: [replication_delay_bytes]
-        # We use a GREATEST call here because this calculation can be negative.
-        # The calculation is not atomic, meaning after we've gotten the receive
-        # LSN, the replay LSN may have advanced past the receive LSN we
-        # are using for the calculation.
        query: |
-          SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
+          SELECT pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn()) AS replication_delay_bytes;

      - metric_name: replication_delay_seconds
        type: gauge
Author	SHA1	Message	Date
Vlad Lazar	71ff8f2433	storcon: implement graceful leader cutover	2024-07-30 17:58:18 +01:00
Vlad Lazar	56c43c4fae	storcon: add start-up sequence utilities	2024-07-30 17:58:17 +01:00
Vlad Lazar	4187657082	storcon: refactor building of observed state at start-up	2024-07-30 17:57:09 +01:00
Vlad Lazar	b690ba5838	storcon: decouple initial heartbeat round from location listing	2024-07-30 17:57:09 +01:00
Vlad Lazar	dd7cafdd97	storcon: add storage controller peer client	2024-07-30 17:57:08 +01:00
Vlad Lazar	c501a10612	storcon: gate starting-up as candidate behind a flag	2024-07-30 17:56:30 +01:00
Vlad Lazar	1fdbef9a44	storcon/persistence: add leader table primitives	2024-07-30 17:56:28 +01:00
Vlad Lazar	3ad1221e55	storcon/diesel: add leader table	2024-07-30 17:54:02 +01:00