//! Timeline repository implementation that keeps old data in layer files, and
//! the recent changes in ephemeral files.
//!
//! See tenant/*_layer.rs files. The functions here are responsible for locating
//! the correct layer for the get/put call, walking back the timeline branching
//! history as needed.
//!
//! The files are stored in the .neon/tenants/<tenant_id>/timelines/<timeline_id>
//! directory. See docs/pageserver-storage.md for how the files are managed.
//! In addition to the layer files, there is a metadata file in the same
//! directory that contains information about the timeline, in particular its
//! parent timeline, and the last LSN that has been written to disk.
//!
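//! A sketch of the layout described above (hypothetical IDs):
//!
//! ```text
//! .neon/
//!   tenants/
//!     <tenant_id>/
//!       timelines/
//!         <timeline_id>/
//!           <layer files>
//!           metadata
//! ```
//!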
use anyhow::{bail, Context};
use arc_swap::ArcSwap;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use chrono::NaiveDateTime;
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::LsnLease;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeoutOrCancel;
use remote_timeline_client::manifest::{
OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION,
};
use remote_timeline_client::UploadQueueNotReadyError;
use std::collections::BTreeMap;
use std::fmt;
use std::future::Future;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::offload::offload_timeline;
use tokio::io::BufReader;
use tokio::sync::watch;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use upload_queue::NotInitialized;
use utils::backoff;
use utils::circuit_breaker::CircuitBreaker;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::failpoint_support;
use utils::fs_ext;
use utils::pausable_failpoint;
use utils::sync::gate::Gate;
use utils::sync::gate::GateGuard;
use utils::timeout::timeout_cancellable;
use utils::timeout::TimeoutCancellableError;
use utils::zstd::create_zst_tarball;
use utils::zstd::extract_zst_tarball;
use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
use self::config::LocationConf;
use self::config::TenantConf;
use self::metadata::TimelineMetadata;
use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest};
use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
use self::timeline::uninit::TimelineCreateGuard;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::GcCutoffs;
use self::timeline::TimelineDeleteProgress;
use self::timeline::TimelineResources;
use self::timeline::WaitLsnError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::gc_result::GcResult;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::walingest::WalLagCooldown;
use crate::walredo;
use crate::InitializationOrder;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::sync::Mutex;
use std::time::{Duration, Instant};
use crate::span;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
use crate::TEMP_FILE_SUFFIX;
use once_cell::sync::Lazy;
pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;
/// Limits how many `initdb` processes may run concurrently on this pageserver.
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use utils::{
crashsafe,
generation::Generation,
id::TimelineId,
lsn::{Lsn, RecordLsn},
};
pub mod blob_io;
pub mod block_io;
pub mod vectored_blob_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod metadata;
pub mod remote_timeline_client;
pub mod storage_layer;
pub mod checks;
pub mod config;
pub mod mgr;
pub mod secondary;
pub mod tasks;
pub mod upload_queue;
pub(crate) mod timeline;
pub mod size;
mod gc_block;
mod gc_result;
pub(crate) mod throttle;
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
// re-export for use in walreceiver
pub use crate::tenant::timeline::WalReceiverInfo;
/// The "tenants" part of `tenants/<tenant>/timelines...`
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
/// Parts of the `.neon/tenants/<tenant_id>/timelines/<timeline_id>` directory prefix.
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
/// References to shared objects that are passed into each tenant, such
/// as the shared remote storage client and process initialization state.
#[derive(Clone)]
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
}
/// A [`Tenant`] is really an _attached_ tenant. The configuration
/// for an attached tenant is a subset of the [`LocationConf`], represented
/// in this struct.
pub(super) struct AttachedTenantConf {
tenant_conf: TenantConfOpt,
location: AttachedLocationConfig,
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<tokio::time::Instant>,
}
impl AttachedTenantConf {
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
// Sets a deadline before which we cannot proceed to GC due to lsn lease.
//
        // We do this because the lease mappings are not persisted to disk. By delaying GC by the lease
// length, we guarantee that all the leases we granted before will have a chance to renew
// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
Some(
tokio::time::Instant::now()
+ tenant_conf
.lsn_lease_length
.unwrap_or(LsnLease::DEFAULT_LENGTH),
)
} else {
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
// because we don't do GC in these modes.
None
};
Self {
tenant_conf,
location,
lsn_lease_deadline,
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => {
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
}
LocationMode::Secondary(_) => {
anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
}
}
}
fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
self.lsn_lease_deadline
.map(|d| tokio::time::Instant::now() < d)
.unwrap_or(false)
}
}
struct TimelinePreload {
timeline_id: TimelineId,
client: RemoteTimelineClient,
index_part: Result<MaybeDeletedIndexPart, DownloadError>,
}
pub(crate) struct TenantPreload {
tenant_manifest: TenantManifest,
timelines: HashMap<TimelineId, TimelinePreload>,
}
/// Controls how eagerly a spawned tenant is activated: immediately, or lazily
/// in the background.
pub(crate) enum SpawnMode {
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
}
///
/// A tenant consists of multiple timelines. Keep them in a hash table.
///
pub struct Tenant {
// Global pageserver config parameters
pub conf: &'static PageServerConf,
/// The value creation timestamp, used to measure activation delay, see:
/// <https://github.com/neondatabase/neon/issues/4025>
constructed_at: Instant,
state: watch::Sender<TenantState>,
// Overridden tenant-specific config parameters.
    // We keep the TenantConfOpt struct here to preserve the information
// about parameters that are not set.
// This is necessary to allow global config updates.
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
tenant_shard_id: TenantShardId,
// The detailed sharding information, beyond the number/count in tenant_shard_id
shard_identity: ShardIdentity,
/// The remote storage generation, used to protect S3 objects from split-brain.
/// Does not change over the lifetime of the [`Tenant`] object.
///
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that the generation doesn't change during a Tenant's lifetime.
generation: Generation,
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
    /// During timeline creation, we first insert the TimelineId into the
    /// creating map, then `timelines`, then remove it from the creating map.
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
/// Possibly offloaded and archived timelines
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
/// each other (this could be optimized to coalesce writes if necessary).
///
/// The contents of the Mutex are the last manifest we successfully uploaded
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
    // the `timelines` mutex for the whole GC iteration could block `get_timeline`,
    // `get_timelines_state`, and other timeline operations for a long time, which in
    // turn may cause replication connections to drop, wait_for_lsn timeouts to expire,
    // and so on.
gc_cs: tokio::sync::Mutex<()>,
walredo_mgr: Option<Arc<WalRedoManager>>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: GenericRemoteStorage,
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
    /// Cached logical sizes, updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
/// Track repeated failures to compact, so that we can back off.
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
pub(crate) activate_now_sem: tokio::sync::Semaphore,
    /// How long the tenant took to attach; used to compute a WAL-lag cooldown after activation.
    /// Unset until attach completes.
attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
// Cancellation token fires when we have entered shutdown(). This is a parent of
// Timelines' cancellation token.
pub(crate) cancel: CancellationToken,
// Users of the Tenant such as the page service must take this Gate to avoid
// trying to use a Tenant which is shutting down.
pub(crate) gate: Gate,
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
pub(crate) timeline_get_throttle:
Arc<throttle::Throttle<crate::metrics::tenant_throttling::TimelineGet>>,
/// An ongoing timeline detach concurrency limiter.
///
    /// As a tenant will likely be restarted as part of timeline detach ancestor, it makes no
    /// sense to have two running at the same time. A different one can be started if an earlier one
/// has failed for whatever reason.
ongoing_timeline_detach: std::sync::Mutex<Option<(TimelineId, utils::completion::Barrier)>>,
/// `index_part.json` based gc blocking reason tracking.
///
/// New gc iterations must start a new iteration by acquiring `GcBlock::start` before
/// proceeding.
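    ///
    /// A usage sketch (assumed API shape; the guard binding is illustrative):
    ///
    /// ```ignore
    /// let _gc_guard = tenant.gc_block.start().await?;
    /// // ... run the gc iteration while the guard is held ...
    /// ```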
pub(crate) gc_block: gc_block::GcBlock,
l0_flush_global_state: L0FlushGlobalState,
}
impl std::fmt::Debug for Tenant {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
}
}
/// A tenant's WAL-redo manager: either the production implementation backed by a
/// PostgreSQL redo process, or a test stub.
pub(crate) enum WalRedoManager {
Prod(WalredoManagerId, PostgresRedoManager),
#[cfg(test)]
Test(harness::TestRedoManager),
}
#[derive(thiserror::Error, Debug)]
#[error("pageserver is shutting down")]
pub(crate) struct GlobalShutDown;
impl WalRedoManager {
pub(crate) fn new(mgr: PostgresRedoManager) -> Result<Arc<Self>, GlobalShutDown> {
let id = WalredoManagerId::next();
let arc = Arc::new(Self::Prod(id, mgr));
let mut guard = WALREDO_MANAGERS.lock().unwrap();
match &mut *guard {
Some(map) => {
map.insert(id, Arc::downgrade(&arc));
Ok(arc)
}
None => Err(GlobalShutDown),
}
}
}
impl Drop for WalRedoManager {
fn drop(&mut self) {
match self {
Self::Prod(id, _) => {
let mut guard = WALREDO_MANAGERS.lock().unwrap();
if let Some(map) = &mut *guard {
map.remove(id).expect("new() registers, drop() unregisters");
}
}
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
}
/// Global registry of all walredo managers so that [`crate::shutdown_pageserver`] can shut down
/// the walredo processes outside of the regular order.
///
/// This is necessary to work around a systemd bug where it freezes if there are
/// walredo processes left => <https://github.com/neondatabase/cloud/issues/11387>
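///
/// A shutdown sketch (assumed shape; the real logic lives in [`crate::shutdown_pageserver`]):
///
/// ```ignore
/// // Taking the map makes WalRedoManager::new() fail with GlobalShutDown afterwards.
/// let managers = WALREDO_MANAGERS.lock().unwrap().take().unwrap_or_default();
/// for (_id, weak) in managers {
///     if let Some(mgr) = weak.upgrade() {
///         mgr.shutdown().await;
///     }
/// }
/// ```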
#[allow(clippy::type_complexity)]
pub(crate) static WALREDO_MANAGERS: once_cell::sync::Lazy<
Mutex<Option<HashMap<WalredoManagerId, Weak<WalRedoManager>>>>,
> = once_cell::sync::Lazy::new(|| Mutex::new(Some(HashMap::new())));
#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)]
pub(crate) struct WalredoManagerId(u64);
impl WalredoManagerId {
pub fn next() -> Self {
static NEXT: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(1);
let id = NEXT.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
if id == 0 {
panic!("WalredoManagerId::new() returned 0, indicating wraparound, risking it's no longer unique");
}
Self(id)
}
}
#[cfg(test)]
impl From<harness::TestRedoManager> for WalRedoManager {
fn from(mgr: harness::TestRedoManager) -> Self {
Self::Test(mgr)
}
}
impl WalRedoManager {
pub(crate) async fn shutdown(&self) -> bool {
match self {
Self::Prod(_, mgr) => mgr.shutdown().await,
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
true
}
}
}
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
match self {
Self::Prod(_, mgr) => mgr.maybe_quiesce(idle_timeout),
#[cfg(test)]
Self::Test(_) => {
// Not applicable to test redo manager
}
}
}
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn request_redo(
&self,
key: pageserver_api::key::Key,
lsn: Lsn,
base_img: Option<(Lsn, bytes::Bytes)>,
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
pg_version: u32,
) -> Result<bytes::Bytes, walredo::Error> {
match self {
Self::Prod(_, mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
}
#[cfg(test)]
Self::Test(mgr) => {
mgr.request_redo(key, lsn, base_img, records, pg_version)
.await
}
}
}
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(_, m) => Some(m.status()),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
}
}
/// A very lightweight memory representation of an offloaded timeline.
///
/// We need to store the list of offloaded timelines so that we can perform operations on them,
/// like unoffloading them, or (at a later date), decide to perform flattening.
/// This type has a much smaller memory impact than [`Timeline`], so we can hold
/// many more offloaded timelines in memory than fully loaded ones.
pub struct OffloadedTimeline {
pub tenant_shard_id: TenantShardId,
pub timeline_id: TimelineId,
pub ancestor_timeline_id: Option<TimelineId>,
    /// The LSN at the ancestor to retain (i.e. the branch point), if any
pub ancestor_retain_lsn: Option<Lsn>,
/// When the timeline was archived.
///
/// Present for future flattening deliberations.
pub archived_at: NaiveDateTime,
    /// Prevents two tasks from deleting the timeline at the same time. If held, the
    /// timeline is being deleted. If the deletion flow has finished, the timeline has already been deleted.
pub delete_progress: TimelineDeleteProgress,
}
impl OffloadedTimeline {
/// Obtains an offloaded timeline from a given timeline object.
///
    /// Returns an error if the `archived_at` flag couldn't be obtained, i.e.
    /// the timeline's upload queue is not in a stopped state.
/// Panics if the timeline is not archived.
fn from_timeline(timeline: &Timeline) -> Result<Self, UploadQueueNotReadyError> {
let ancestor_retain_lsn = timeline
.get_ancestor_timeline_id()
.map(|_timeline_id| timeline.get_ancestor_lsn());
let archived_at = timeline
.remote_client
.archived_at_stopped_queue()?
.expect("must be called on an archived timeline");
Ok(Self {
tenant_shard_id: timeline.tenant_shard_id,
timeline_id: timeline.timeline_id,
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
ancestor_retain_lsn,
archived_at,
delete_progress: timeline.delete_progress.clone(),
})
}
fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self {
let OffloadedTimelineManifest {
timeline_id,
ancestor_timeline_id,
ancestor_retain_lsn,
archived_at,
} = *manifest;
Self {
tenant_shard_id,
timeline_id,
ancestor_timeline_id,
ancestor_retain_lsn,
archived_at,
delete_progress: TimelineDeleteProgress::default(),
}
}
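    /// Builds the manifest entry for this timeline.
    ///
    /// Together with [`Self::from_manifest`] this round-trips (modulo
    /// `delete_progress`, which is transient). A sketch:
    ///
    /// ```ignore
    /// let entry = offloaded.manifest();
    /// let restored = OffloadedTimeline::from_manifest(offloaded.tenant_shard_id, &entry);
    /// assert_eq!(restored.timeline_id, offloaded.timeline_id);
    /// ```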
fn manifest(&self) -> OffloadedTimelineManifest {
let Self {
timeline_id,
ancestor_timeline_id,
ancestor_retain_lsn,
archived_at,
..
} = self;
OffloadedTimelineManifest {
timeline_id: *timeline_id,
ancestor_timeline_id: *ancestor_timeline_id,
ancestor_retain_lsn: *ancestor_retain_lsn,
archived_at: *archived_at,
}
}
}
impl fmt::Debug for OffloadedTimeline {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "OffloadedTimeline<{}>", self.timeline_id)
}
}
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum MaybeOffloaded {
Yes,
No,
}
#[derive(Clone, Debug)]
pub enum TimelineOrOffloaded {
Timeline(Arc<Timeline>),
Offloaded(Arc<OffloadedTimeline>),
}
impl TimelineOrOffloaded {
pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> {
match self {
TimelineOrOffloaded::Timeline(timeline) => {
TimelineOrOffloadedArcRef::Timeline(timeline)
}
TimelineOrOffloaded::Offloaded(offloaded) => {
TimelineOrOffloadedArcRef::Offloaded(offloaded)
}
}
}
pub fn tenant_shard_id(&self) -> TenantShardId {
self.arc_ref().tenant_shard_id()
}
pub fn timeline_id(&self) -> TimelineId {
self.arc_ref().timeline_id()
}
pub fn delete_progress(&self) -> &Arc<tokio::sync::Mutex<DeleteTimelineFlow>> {
match self {
TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress,
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
}
}
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
match self {
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
TimelineOrOffloaded::Offloaded(_offloaded) => None,
}
}
}
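// A dispatch sketch: callers that only need identifiers can stay agnostic of
// whether the timeline is loaded or offloaded, e.g.:
//
//     fn log_timeline(t: &TimelineOrOffloaded) {
//         tracing::info!("{}/{}", t.tenant_shard_id(), t.timeline_id());
//     }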
pub enum TimelineOrOffloadedArcRef<'a> {
Timeline(&'a Arc<Timeline>),
Offloaded(&'a Arc<OffloadedTimeline>),
}
impl TimelineOrOffloadedArcRef<'_> {
pub fn tenant_shard_id(&self) -> TenantShardId {
match self {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
}
}
pub fn timeline_id(&self) -> TimelineId {
match self {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
}
}
}
impl<'a> From<&'a Arc<Timeline>> for TimelineOrOffloadedArcRef<'a> {
fn from(timeline: &'a Arc<Timeline>) -> Self {
Self::Timeline(timeline)
}
}
impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
fn from(timeline: &'a Arc<OffloadedTimeline>) -> Self {
Self::Offloaded(timeline)
}
}
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
pub enum GetTimelineError {
#[error("Timeline is shutting down")]
ShuttingDown,
#[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
NotActive {
tenant_id: TenantShardId,
timeline_id: TimelineId,
state: TimelineState,
},
#[error("Timeline {tenant_id}/{timeline_id} was not found")]
NotFound {
tenant_id: TenantShardId,
timeline_id: TimelineId,
},
}
#[derive(Debug, thiserror::Error)]
pub enum LoadLocalTimelineError {
#[error("FailedToLoad")]
Load(#[source] anyhow::Error),
#[error("FailedToResumeDeletion")]
ResumeDeletion(#[source] anyhow::Error),
}
#[derive(thiserror::Error)]
pub enum DeleteTimelineError {
#[error("NotFound")]
NotFound,
#[error("HasChildren")]
HasChildren(Vec<TimelineId>),
#[error("Timeline deletion is already in progress")]
AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl Debug for DeleteTimelineError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
#[derive(thiserror::Error)]
pub enum TimelineArchivalError {
#[error("NotFound")]
NotFound,
#[error("Timeout")]
Timeout,
#[error("Cancelled")]
Cancelled,
#[error("ancestor is archived: {}", .0)]
HasArchivedParent(TimelineId),
#[error("HasUnarchivedChildren")]
HasUnarchivedChildren(Vec<TimelineId>),
#[error("Timeline archival is already in progress")]
AlreadyInProgress,
#[error(transparent)]
Other(anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum TenantManifestError {
#[error("Remote storage error: {0}")]
RemoteStorage(anyhow::Error),
#[error("Cancelled")]
Cancelled,
}
impl From<TenantManifestError> for TimelineArchivalError {
fn from(e: TenantManifestError) -> Self {
match e {
TenantManifestError::RemoteStorage(e) => Self::Other(e),
TenantManifestError::Cancelled => Self::Cancelled,
}
}
}
impl Debug for TimelineArchivalError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotFound => write!(f, "NotFound"),
Self::Timeout => write!(f, "Timeout"),
Self::Cancelled => write!(f, "Cancelled"),
Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
Self::HasUnarchivedChildren(c) => {
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
}
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
}
}
}
pub enum SetStoppingError {
AlreadyStopping(completion::Barrier),
Broken,
}
impl Debug for SetStoppingError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
Self::Broken => write!(f, "Broken"),
}
}
}
/// Arguments to [`Tenant::create_timeline`].
///
/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
/// is `None`, the result of the timeline create call is not deterministic.
///
/// See [`CreateTimelineIdempotency`] for an idempotency key.
#[derive(Debug)]
pub(crate) enum CreateTimelineParams {
Bootstrap(CreateTimelineParamsBootstrap),
Branch(CreateTimelineParamsBranch),
}
#[derive(Debug)]
pub(crate) struct CreateTimelineParamsBootstrap {
pub(crate) new_timeline_id: TimelineId,
pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
pub(crate) pg_version: u32,
}
/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
#[derive(Debug)]
pub(crate) struct CreateTimelineParamsBranch {
pub(crate) new_timeline_id: TimelineId,
pub(crate) ancestor_timeline_id: TimelineId,
pub(crate) ancestor_start_lsn: Option<Lsn>,
}
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`].
///
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
///
/// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request.
/// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict.
///
/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict.
/// It would be nice to have more advanced derive macros to make that special treatment declarative.
///
/// Notes:
/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so branching will be at a deterministic LSN.
/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`]
/// is not considered for idempotency. We can improve on this over time if we deem it necessary.
///
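/// A comparison sketch (hypothetical timeline ID and LSN):
///
/// ```ignore
/// let request = CreateTimelineIdempotency::Branch {
///     ancestor_timeline_id,
///     ancestor_start_lsn: Lsn(0x10),
/// };
/// // An identical retry compares equal, so the existing timeline is returned;
/// // a differing request (or `FailWithConflict`) is an idempotency conflict.
/// assert_eq!(request, request.clone());
/// ```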
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum CreateTimelineIdempotency {
/// NB: special treatment, see comment in [`Self`].
FailWithConflict,
Bootstrap {
pg_version: u32,
},
/// NB: branches always have the same `pg_version` as their ancestor.
/// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
/// exists as a field, and is set by cplane, it has always been ignored by pageserver when
/// determining the child branch pg_version.
Branch {
ancestor_timeline_id: TimelineId,
ancestor_start_lsn: Lsn,
},
}
/// What is returned by [`Tenant::start_creating_timeline`].
#[must_use]
enum StartCreatingTimelineResult<'t> {
CreateGuard(TimelineCreateGuard<'t>),
Idempotent(Arc<Timeline>),
}
/// What is returned by [`Tenant::create_timeline`].
enum CreateTimelineResult {
Created(Arc<Timeline>),
Idempotent(Arc<Timeline>),
}
impl CreateTimelineResult {
fn discriminant(&self) -> &'static str {
match self {
Self::Created(_) => "Created",
Self::Idempotent(_) => "Idempotent",
}
}
fn timeline(&self) -> &Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
}
}
    /// Unit test timelines aren't activated; the test has to do it if it needs to.
#[cfg(test)]
fn into_timeline_for_test(self) -> Arc<Timeline> {
match self {
Self::Created(t) | Self::Idempotent(t) => t,
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("creation of timeline with the given ID is in progress")]
AlreadyCreating,
#[error("timeline already exists with different parameters")]
Conflict,
#[error(transparent)]
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
AncestorNotActive,
#[error("ancestor timeline is archived")]
AncestorArchived,
#[error("tenant shutting down")]
ShuttingDown,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(thiserror::Error, Debug)]
enum InitdbError {
Other(anyhow::Error),
Cancelled,
Spawn(std::io::Result<()>),
Failed(std::process::ExitStatus, Vec<u8>),
}
impl fmt::Display for InitdbError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
InitdbError::Cancelled => write!(f, "Operation was cancelled"),
InitdbError::Spawn(e) => write!(f, "Spawn error: {:?}", e),
InitdbError::Failed(status, stderr) => write!(
f,
"Command failed with status {:?}: {}",
status,
String::from_utf8_lossy(stderr)
),
InitdbError::Other(e) => write!(f, "Error: {:?}", e),
}
}
}
impl From<std::io::Error> for InitdbError {
fn from(error: std::io::Error) -> Self {
InitdbError::Spawn(Err(error))
}
}
enum CreateTimelineCause {
Load,
Delete,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GcError {
// The tenant is shutting down
#[error("tenant shutting down")]
TenantCancelled,
    // The timeline is shutting down
#[error("timeline shutting down")]
TimelineCancelled,
    // The tenant is in a state ineligible to run GC
#[error("not active")]
NotActive,
// A requested GC cutoff LSN was invalid, for example it tried to move backwards
#[error("not active")]
BadLsn { why: String },
// A remote storage error while scheduling updates after compaction
#[error(transparent)]
Remote(anyhow::Error),
// An error reading while calculating GC cutoffs
#[error(transparent)]
GcCutoffs(PageReconstructError),
// If GC was invoked for a particular timeline, this error means it didn't exist
#[error("timeline not found")]
TimelineNotFound,
}
impl From<PageReconstructError> for GcError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::TimelineCancelled,
other => Self::GcCutoffs(other),
}
}
}
impl From<NotInitialized> for GcError {
fn from(value: NotInitialized) -> Self {
match value {
NotInitialized::Uninitialized => GcError::Remote(value.into()),
NotInitialized::Stopped | NotInitialized::ShuttingDown => GcError::TimelineCancelled,
}
}
}
impl From<timeline::layer_manager::Shutdown> for GcError {
fn from(_: timeline::layer_manager::Shutdown) -> Self {
GcError::TimelineCancelled
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum LoadConfigError {
#[error("TOML deserialization error: '{0}'")]
DeserializeToml(#[from] toml_edit::de::Error),
#[error("Config not found at {0}")]
NotFound(Utf8PathBuf),
}
impl Tenant {
/// Yet another helper for timeline initialization.
///
/// - Initializes the Timeline struct and inserts it into the tenant's hash map
/// - Scans the local timeline directory for layer files and builds the layer map
    /// - Adds remote layers from the (already downloaded) index file to the layer map
/// - Schedules remote upload tasks for any files that are present locally but missing from remote storage.
///
/// If the operation fails, the timeline is left in the tenant's hash map in Broken state. On success,
/// it is marked as Active.
#[allow(clippy::too_many_arguments)]
async fn timeline_init_and_sync(
&self,
timeline_id: TimelineId,
resources: TimelineResources,
index_part: IndexPart,
metadata: TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
let tenant_id = self.tenant_shard_id;
let idempotency = if metadata.ancestor_timeline().is_none() {
CreateTimelineIdempotency::Bootstrap {
pg_version: metadata.pg_version(),
}
} else {
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: metadata.ancestor_timeline().unwrap(),
ancestor_start_lsn: metadata.ancestor_lsn(),
}
};
let timeline = self.create_timeline_struct(
timeline_id,
&metadata,
ancestor.clone(),
resources,
CreateTimelineCause::Load,
idempotency.clone(),
)?;
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
disk_consistent_lsn.is_valid(),
"Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
assert_eq!(
disk_consistent_lsn,
metadata.disk_consistent_lsn(),
"these are used interchangeably"
);
timeline.remote_client.init_upload_queue(&index_part)?;
timeline
.load_layer_map(disk_consistent_lsn, index_part)
.await
.with_context(|| {
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
})?;
{
            // avoid holding the lock across await points
let mut timelines_accessor = self.timelines.lock().unwrap();
match timelines_accessor.entry(timeline_id) {
// We should never try and load the same timeline twice during startup
Entry::Occupied(_) => {
unreachable!(
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
);
}
Entry::Vacant(v) => {
v.insert(Arc::clone(&timeline));
timeline.maybe_spawn_flush_loop();
}
}
};
// Sanity check: a timeline should have some content.
anyhow::ensure!(
ancestor.is_some()
|| timeline
.layers
.read()
.await
.layer_map()
.expect("currently loading, layer manager cannot be shutdown already")
.iter_historic_layers()
.next()
.is_some(),
"Timeline has no ancestor and no layer files"
);
Ok(())
}
/// Attach a tenant that's available in cloud storage.
///
    /// This returns quickly, after just creating the in-memory `Tenant`
    /// object and launching a background task to download
/// the remote index files. On return, the tenant is most likely still in
/// Attaching state, and it will become Active once the background task
/// finishes. You can use wait_until_active() to wait for the task to
/// complete.
///
#[allow(clippy::too_many_arguments)]
pub(crate) fn spawn(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
resources: TenantSharedResources,
attached_conf: AttachedTenantConf,
shard_identity: ShardIdentity,
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, GlobalShutDown> {
let wal_redo_manager =
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
let TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
l0_flush_global_state,
} = resources;
let attach_mode = attached_conf.location.attach_mode;
let generation = attached_conf.location.generation;
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
conf,
attached_conf,
shard_identity,
Some(wal_redo_manager),
tenant_shard_id,
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
));
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
// we shut down while attaching.
let attach_gate_guard = tenant
.gate
.enter()
.expect("We just created the Tenant: nothing else can have shut it down yet");
// Do all the hard work in the background
let tenant_clone = Arc::clone(&tenant);
let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
tenant_shard_id,
None,
"attach tenant",
async move {
info!(
?attach_mode,
"Attaching tenant"
);
let _gate_guard = attach_gate_guard;
// Is this tenant being spawned as part of process startup?
let starting_up = init_order.is_some();
scopeguard::defer! {
if starting_up {
TENANT.startup_complete.inc();
}
}
                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when the tenant is in a loading state.
enum BrokenVerbosity {
Error,
Info
}
let make_broken =
|t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
match verbosity {
BrokenVerbosity::Info => {
info!("attach cancelled, setting tenant state to Broken: {err}");
},
BrokenVerbosity::Error => {
error!("attach failed, setting tenant state to Broken: {err:?}");
}
}
t.state.send_modify(|state| {
// The Stopping case is for when we have passed control on to DeleteTenantFlow:
// if it errors, we will call make_broken when tenant is already in Stopping.
assert!(
matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
"the attach task owns the tenant state until activation is complete"
);
*state = TenantState::broken_from_reason(err.to_string());
});
};
// TODO: should also be rejecting tenant conf changes that violate this check.
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
let mut init_order = init_order;
// take the completion because initial tenant loading will complete when all of
// these tasks complete.
let _completion = init_order
.as_mut()
.and_then(|x| x.initial_tenant_load.take());
let remote_load_completion = init_order
.as_mut()
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
OnDemand,
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
Normal,
}
let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access to this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)
tokio::select!(
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
// stayed in Activating for such a long time that shutdown found it in
// that state.
tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
// Make the tenant broken so that set_stopping will not hang waiting for it to leave
// the Attaching state. This is an over-reaction (nothing really broke, the tenant is
// just shutting down), but ensures progress.
make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
return Ok(());
},
)
} else {
                    // SpawnMode::Eager always causes jumping ahead of the
                    // concurrent_tenant_warmup queue
AttachType::Normal
};
let preload = match &mode {
SpawnMode::Eager | SpawnMode::Lazy => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(&remote_storage, task_mgr::shutdown_token())
.await;
match res {
Ok(p) => Some(p),
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
return Ok(());
}
}
}
};
// Remote preload is complete.
drop(remote_load_completion);
                // Time the duration of the attach phase.
let attach_start = std::time::Instant::now();
let attached = {
let _attach_timer = Some(TENANT.attach.start_timer());
tenant_clone.attach(preload, &ctx).await
};
let attach_duration = attach_start.elapsed();
_ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));
match attached {
Ok(()) => {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
}
}
// If we are doing an opportunistic warmup attachment at startup, initialize
// logical size at the same time. This is better than starting a bunch of idle tenants
// with cold caches and then coming back later to initialize their logical sizes.
//
                // It also prevents the warmup process competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
tracing::info!("Warm-up complete");
}
Ok(())
}
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
);
Ok(tenant)
}
#[instrument(skip_all)]
pub(crate) async fn preload(
self: &Arc<Self>,
remote_storage: &GenericRemoteStorage,
cancel: CancellationToken,
) -> anyhow::Result<TenantPreload> {
span::debug_assert_current_span_has_tenant_id();
        // Get the list of remote timelines and download the index file for
        // every timeline of this tenant.
info!("listing remote timelines");
let (remote_timeline_ids, other_keys) = remote_timeline_client::list_remote_timelines(
remote_storage,
self.tenant_shard_id,
cancel.clone(),
)
.await?;
let (offloaded_add, tenant_manifest) =
match remote_timeline_client::download_tenant_manifest(
remote_storage,
&self.tenant_shard_id,
self.generation,
&cancel,
)
.await
{
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
tenant_manifest,
),
Err(DownloadError::NotFound) => {
("no manifest".to_string(), TenantManifest::empty())
}
Err(e) => Err(e)?,
};
info!(
"found {} timelines, and {offloaded_add}",
remote_timeline_ids.len()
);
for k in other_keys {
warn!("Unexpected non timeline key {k}");
}
Ok(TenantPreload {
tenant_manifest,
timelines: self
.load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
.await?,
})
}
///
/// Background task that downloads all data for a tenant and brings it to Active state.
///
/// No background tasks are started as part of this routine.
///
async fn attach(
self: &Arc<Tenant>,
preload: Option<TenantPreload>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
failpoint_support::sleep_millis_async!("before-attaching-tenant");
let Some(preload) = preload else {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
};
let mut offloaded_timeline_ids = HashSet::new();
let mut offloaded_timelines_list = Vec::new();
for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
let timeline_id = timeline_manifest.timeline_id;
let offloaded_timeline =
OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
offloaded_timeline_ids.insert(timeline_id);
}
let mut timelines_to_resume_deletions = vec![];
let mut remote_index_and_client = HashMap::new();
let mut timeline_ancestors = HashMap::new();
let mut existent_timelines = HashSet::new();
for (timeline_id, preload) in preload.timelines {
if offloaded_timeline_ids.remove(&timeline_id) {
// The timeline is offloaded, skip loading it.
continue;
}
let index_part = match preload.index_part {
Ok(i) => {
debug!("remote index part exists for timeline {timeline_id}");
                    // We found index_part on the remote; this is the standard case.
existent_timelines.insert(timeline_id);
i
}
Err(DownloadError::NotFound) => {
// There is no index_part on the remote. We only get here
// if there is some prefix for the timeline in the remote storage.
// This can e.g. be the initdb.tar.zst archive, maybe a
// remnant from a prior incomplete creation or deletion attempt.
// Delete the local directory as the deciding criterion for a
// timeline's existence is presence of index_part.
info!(%timeline_id, "index_part not found on remote");
continue;
}
Err(e) => {
// Some (possibly ephemeral) error happened during index_part download.
// Pretend the timeline exists to not delete the timeline directory,
// as it might be a temporary issue and we don't want to re-download
// everything after it resolves.
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
existent_timelines.insert(timeline_id);
continue;
}
};
match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => {
timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
remote_index_and_client.insert(timeline_id, (index_part, preload.client));
}
MaybeDeletedIndexPart::Deleted(index_part) => {
info!(
"timeline {} is deleted, picking to resume deletion",
timeline_id
);
timelines_to_resume_deletions.push((timeline_id, index_part, preload.client));
}
}
}
let mut gc_blocks = HashMap::new();
// For every timeline, download the metadata file, scan the local directory,
// and build a layer map that contains an entry for each remote and local
// layer file.
let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
for (timeline_id, remote_metadata) in sorted_timelines {
let (index_part, remote_client) = remote_index_and_client
.remove(&timeline_id)
.expect("just put it in above");
if let Some(blocking) = index_part.gc_blocking.as_ref() {
// could just filter these away, but it helps while testing
anyhow::ensure!(
!blocking.reasons.is_empty(),
"index_part for {timeline_id} is malformed: it should not have gc blocking with zero reasons"
);
let prev = gc_blocks.insert(timeline_id, blocking.reasons);
assert!(prev.is_none());
}
// TODO again handle early failure
self.load_remote_timeline(
timeline_id,
index_part,
remote_metadata,
TimelineResources {
remote_client,
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
},
ctx,
)
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_shard_id
)
})?;
}
// Walk through deleted timelines, resume deletion
for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
remote_timeline_client
.init_upload_queue_stopped_to_continue_deletion(&index_part)
.context("init queue stopped")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
DeleteTimelineFlow::resume_deletion(
Arc::clone(self),
timeline_id,
&index_part.metadata,
remote_timeline_client,
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
}
// Complete deletions for offloaded timeline id's.
offloaded_timelines_list
.retain(|(offloaded_id, _offloaded)| {
                // At this point, offloaded_timeline_ids contains the offloaded timelines
                // that have no prefix in S3, i.e. they don't exist remotely.
                // Ultimately, a timeline's existence is determined by the presence of an
                // index-part.json in remote storage; dangling references elsewhere need to be cleaned up.
let delete = offloaded_timeline_ids.contains(offloaded_id);
if delete {
tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found");
}
!delete
});
if !offloaded_timelines_list.is_empty() {
tracing::info!(
"Tenant has {} offloaded timelines",
offloaded_timelines_list.len()
);
}
{
let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
}
if !offloaded_timeline_ids.is_empty() {
self.store_tenant_manifest().await?;
}
// The local filesystem contents are a cache of what's in the remote IndexPart;
// IndexPart is the source of truth.
self.clean_up_timelines(&existent_timelines)?;
self.gc_block.set_scanned(gc_blocks);
fail::fail_point!("attach-before-activate", |_| {
anyhow::bail!("attach-before-activate");
});
failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);
info!("Done");
Ok(())
}
/// Check for any local timeline directories that are temporary, or do not correspond to a
/// timeline that still exists: this can happen if we crashed during a deletion/creation, or
/// if a timeline was deleted while the tenant was attached to a different pageserver.
fn clean_up_timelines(&self, existent_timelines: &HashSet<TimelineId>) -> anyhow::Result<()> {
let timelines_dir = self.conf.timelines_path(&self.tenant_shard_id);
let entries = match timelines_dir.read_dir_utf8() {
Ok(d) => d,
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
return Ok(());
} else {
return Err(e).context("list timelines directory for tenant");
}
}
};
for entry in entries {
let entry = entry.context("read timeline dir entry")?;
let entry_path = entry.path();
let purge = if crate::is_temporary(entry_path)
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|| is_uninit_mark(entry_path)
|| crate::is_delete_mark(entry_path)
{
true
} else {
match TimelineId::try_from(entry_path.file_name()) {
Ok(i) => {
// Purge if the timeline ID does not exist in remote storage: remote storage is the authority.
!existent_timelines.contains(&i)
}
Err(e) => {
tracing::warn!(
"Unparseable directory in timelines directory: {entry_path}, ignoring ({e})"
);
// Do not purge junk: if we don't recognize it, be cautious and leave it for a human.
false
}
}
};
if purge {
tracing::info!("Purging stale timeline dentry {entry_path}");
if let Err(e) = match entry.file_type() {
Ok(t) => if t.is_dir() {
std::fs::remove_dir_all(entry_path)
} else {
std::fs::remove_file(entry_path)
}
.or_else(fs_ext::ignore_not_found),
Err(e) => Err(e),
} {
tracing::warn!("Failed to purge stale timeline dentry {entry_path}: {e}");
}
}
}
Ok(())
}
/// Get sum of all remote timelines sizes
///
/// This function relies on the index_part instead of listing the remote storage
pub fn remote_size(&self) -> u64 {
let mut size = 0;
for timeline in self.list_timelines() {
size += timeline.remote_client.get_remote_physical_size();
}
size
}
#[instrument(skip_all, fields(timeline_id=%timeline_id))]
async fn load_remote_timeline(
&self,
timeline_id: TimelineId,
index_part: IndexPart,
remote_metadata: TimelineMetadata,
resources: TimelineResources,
ctx: &RequestContext,
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
info!("downloading index file for timeline {}", timeline_id);
tokio::fs::create_dir_all(self.conf.timeline_path(&self.tenant_shard_id, &timeline_id))
.await
.context("Failed to create new timeline directory")?;
let ancestor = if let Some(ancestor_id) = remote_metadata.ancestor_timeline() {
let timelines = self.timelines.lock().unwrap();
Some(Arc::clone(timelines.get(&ancestor_id).ok_or_else(
|| {
anyhow::anyhow!(
"cannot find ancestor timeline {ancestor_id} for timeline {timeline_id}"
)
},
)?))
} else {
None
};
self.timeline_init_and_sync(
timeline_id,
resources,
index_part,
remote_metadata,
ancestor,
ctx,
)
.await
}
async fn load_timelines_metadata(
self: &Arc<Tenant>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
cancel: CancellationToken,
) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
let mut part_downloads = JoinSet::new();
for timeline_id in timeline_ids {
let cancel_clone = cancel.clone();
part_downloads.spawn(
self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
.instrument(info_span!("download_index_part", %timeline_id)),
);
}
let mut timeline_preloads: HashMap<TimelineId, TimelinePreload> = HashMap::new();
loop {
tokio::select!(
next = part_downloads.join_next() => {
match next {
Some(result) => {
let preload = result.context("join preload task")?;
timeline_preloads.insert(preload.timeline_id, preload);
},
None => {
break;
}
}
},
_ = cancel.cancelled() => {
anyhow::bail!("Cancelled while waiting for remote index download")
}
)
}
Ok(timeline_preloads)
}
fn build_timeline_client(
&self,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
) -> RemoteTimelineClient {
RemoteTimelineClient::new(
remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
)
}
fn load_timeline_metadata(
self: &Arc<Tenant>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
) -> impl Future<Output = TimelinePreload> {
let client = self.build_timeline_client(timeline_id, remote_storage);
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("starting index part download");
let index_part = client.download_index_file(&cancel).await;
debug!("finished index part download");
TimelinePreload {
client,
timeline_id,
index_part,
}
}
}
fn check_to_be_archived_has_no_unarchived_children(
timeline_id: TimelineId,
timelines: &std::sync::MutexGuard<'_, HashMap<TimelineId, Arc<Timeline>>>,
) -> Result<(), TimelineArchivalError> {
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
return None;
}
if entry.is_archived() == Some(true) {
return None;
}
Some(*id)
})
.collect();
if !children.is_empty() {
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
}
Ok(())
}
fn check_ancestor_of_to_be_unarchived_is_not_archived(
ancestor_timeline_id: TimelineId,
timelines: &std::sync::MutexGuard<'_, HashMap<TimelineId, Arc<Timeline>>>,
offloaded_timelines: &std::sync::MutexGuard<
'_,
HashMap<TimelineId, Arc<OffloadedTimeline>>,
>,
) -> Result<(), TimelineArchivalError> {
let has_archived_parent =
if let Some(ancestor_timeline) = timelines.get(&ancestor_timeline_id) {
ancestor_timeline.is_archived() == Some(true)
} else if offloaded_timelines.contains_key(&ancestor_timeline_id) {
true
} else {
error!("ancestor timeline {ancestor_timeline_id} not found");
if cfg!(debug_assertions) {
panic!("ancestor timeline {ancestor_timeline_id} not found");
}
return Err(TimelineArchivalError::NotFound);
};
if has_archived_parent {
return Err(TimelineArchivalError::HasArchivedParent(
ancestor_timeline_id,
));
}
Ok(())
}
fn check_to_be_unarchived_timeline_has_no_archived_parent(
timeline: &Arc<Timeline>,
) -> Result<(), TimelineArchivalError> {
if let Some(ancestor_timeline) = timeline.ancestor_timeline() {
if ancestor_timeline.is_archived() == Some(true) {
return Err(TimelineArchivalError::HasArchivedParent(
ancestor_timeline.timeline_id,
));
}
}
Ok(())
}
/// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline
///
/// Counterpart to [`offload_timeline`].
async fn unoffload_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
broker_client: storage_broker::BrokerClientChannel,
ctx: RequestContext,
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
        // We activate the timeline below manually, so this must be called on an active tenant.
// We expect callers of this function to ensure this.
match self.current_state() {
TenantState::Activating { .. }
| TenantState::Attaching
| TenantState::Broken { .. } => {
panic!("Timeline expected to be active")
}
TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
TenantState::Active => {}
}
let cancel = self.cancel.clone();
// Protect against concurrent attempts to use this TimelineId
// We don't care much about idempotency, as it's ensured a layer above.
let allow_offloaded = true;
let _create_guard = self
.create_timeline_create_guard(
timeline_id,
CreateTimelineIdempotency::FailWithConflict,
allow_offloaded,
)
.map_err(|err| match err {
TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress,
TimelineExclusionError::AlreadyExists { .. } => {
TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists"))
}
TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e),
})?;
let timeline_preload = self
.load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
.await;
let index_part = match timeline_preload.index_part {
Ok(index_part) => {
debug!("remote index part exists for timeline {timeline_id}");
index_part
}
Err(DownloadError::NotFound) => {
error!(%timeline_id, "index_part not found on remote");
return Err(TimelineArchivalError::NotFound);
}
Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled),
Err(e) => {
// Some (possibly ephemeral) error happened during index_part download.
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
return Err(TimelineArchivalError::Other(
anyhow::Error::new(e).context("downloading index_part from remote storage"),
));
}
};
let index_part = match index_part {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_index_part) => {
info!("timeline is deleted according to index_part.json");
return Err(TimelineArchivalError::NotFound);
}
};
let remote_metadata = index_part.metadata.clone();
let timeline_resources = self.build_timeline_resources(timeline_id);
self.load_remote_timeline(
timeline_id,
index_part,
remote_metadata,
timeline_resources,
&ctx,
)
.await
.with_context(|| {
format!(
"failed to load remote timeline {} for tenant {}",
timeline_id, self.tenant_shard_id
)
})
.map_err(TimelineArchivalError::Other)?;
let timeline = {
let timelines = self.timelines.lock().unwrap();
let Some(timeline) = timelines.get(&timeline_id) else {
warn!("timeline not available directly after attach");
// This is not a panic because no locks are held between `load_remote_timeline`
// which puts the timeline into `timelines`, and our lookup in the timeline map.
return Err(TimelineArchivalError::Other(anyhow::anyhow!(
"timeline not available directly after attach"
)));
};
let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap();
if offloaded_timelines.remove(&timeline_id).is_none() {
warn!("timeline already removed from offloaded timelines");
}
Arc::clone(timeline)
};
// Upload new list of offloaded timelines to S3
self.store_tenant_manifest().await?;
// Activate the timeline (if it makes sense)
if !(timeline.is_broken() || timeline.is_stopping()) {
let background_jobs_can_start = None;
timeline.activate(
self.clone(),
broker_client.clone(),
background_jobs_can_start,
&ctx,
);
}
info!("timeline unoffloading complete");
Ok(timeline)
}
pub(crate) async fn apply_timeline_archival_config(
self: &Arc<Self>,
timeline_id: TimelineId,
new_state: TimelineArchivalState,
broker_client: storage_broker::BrokerClientChannel,
ctx: RequestContext,
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
// First part: figure out what needs to be done, and do validation
let timeline_or_unarchive_offloaded = 'outer: {
let timelines = self.timelines.lock().unwrap();
let Some(timeline) = timelines.get(&timeline_id) else {
let offloaded_timelines = self.timelines_offloaded.lock().unwrap();
let Some(offloaded) = offloaded_timelines.get(&timeline_id) else {
return Err(TimelineArchivalError::NotFound);
};
if new_state == TimelineArchivalState::Archived {
// It's offloaded already, so nothing to do
return Ok(());
}
if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id {
Self::check_ancestor_of_to_be_unarchived_is_not_archived(
ancestor_timeline_id,
&timelines,
&offloaded_timelines,
)?;
}
break 'outer None;
};
// Do some validation. We release the timelines lock below, so there is potential
// for race conditions: these checks exist mainly to prevent misunderstandings of
// the API's capabilities, rather than serving as the sole way to defend their invariants.
match new_state {
TimelineArchivalState::Unarchived => {
Self::check_to_be_unarchived_timeline_has_no_archived_parent(timeline)?
}
TimelineArchivalState::Archived => {
Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)?
}
}
Some(Arc::clone(timeline))
};
// Second part: unoffload timeline (if needed)
let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded {
timeline
} else {
// Turn offloaded timeline into a non-offloaded one
self.unoffload_timeline(timeline_id, broker_client, ctx)
.await?
};
// Third part: upload new timeline archival state and block until it is present in S3
let upload_needed = match timeline
.remote_client
.schedule_index_upload_for_timeline_archival_state(new_state)
{
Ok(upload_needed) => upload_needed,
Err(e) => {
if timeline.cancel.is_cancelled() {
return Err(TimelineArchivalError::Cancelled);
} else {
return Err(TimelineArchivalError::Other(e));
}
}
};
if upload_needed {
info!("Uploading new state");
const MAX_WAIT: Duration = Duration::from_secs(10);
let Ok(v) =
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
else {
tracing::warn!("reached timeout for waiting on upload queue");
return Err(TimelineArchivalError::Timeout);
};
v.map_err(|e| match e {
WaitCompletionError::NotInitialized(e) => {
TimelineArchivalError::Other(anyhow::anyhow!(e))
}
WaitCompletionError::UploadQueueShutDownOrStopped => {
TimelineArchivalError::Cancelled
}
})?;
}
Ok(())
}
pub fn get_offloaded_timeline(
&self,
timeline_id: TimelineId,
) -> Result<Arc<OffloadedTimeline>, GetTimelineError> {
self.timelines_offloaded
.lock()
.unwrap()
.get(&timeline_id)
.map(Arc::clone)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_shard_id,
timeline_id,
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
/// Get Timeline handle for given Neon timeline ID.
/// This function is idempotent. It doesn't change internal state in any way.
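///
/// A minimal usage sketch (hypothetical caller; `tenant` and `timeline_id` are
/// assumed to be in scope):
/// ```ignore
/// // Pass `true` to require the timeline to be Active; pass `false` to
/// // accept a timeline in any state.
/// let timeline = tenant.get_timeline(timeline_id, true)?;
/// assert!(timeline.is_active());
/// ```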
pub fn get_timeline(
&self,
timeline_id: TimelineId,
active_only: bool,
) -> Result<Arc<Timeline>, GetTimelineError> {
let timelines_accessor = self.timelines.lock().unwrap();
let timeline = timelines_accessor
.get(&timeline_id)
.ok_or(GetTimelineError::NotFound {
tenant_id: self.tenant_shard_id,
timeline_id,
})?;
if active_only && !timeline.is_active() {
Err(GetTimelineError::NotActive {
tenant_id: self.tenant_shard_id,
timeline_id,
state: timeline.current_state(),
})
} else {
Ok(Arc::clone(timeline))
}
}
/// Lists timelines the tenant contains.
/// It's up to callers to omit certain timelines that are not considered ready for use.
pub fn list_timelines(&self) -> Vec<Arc<Timeline>> {
self.timelines
.lock()
.unwrap()
.values()
.map(Arc::clone)
.collect()
}
/// Lists timelines the tenant manages, including offloaded ones.
///
/// It's up to callers to omit certain timelines that are not considered ready for use.
pub fn list_timelines_and_offloaded(
&self,
) -> (Vec<Arc<Timeline>>, Vec<Arc<OffloadedTimeline>>) {
let timelines = self
.timelines
.lock()
.unwrap()
.values()
.map(Arc::clone)
.collect();
let offloaded = self
.timelines_offloaded
.lock()
.unwrap()
.values()
.map(Arc::clone)
.collect();
(timelines, offloaded)
}
pub fn list_timeline_ids(&self) -> Vec<TimelineId> {
self.timelines.lock().unwrap().keys().cloned().collect()
}
/// This is used by tests & import-from-basebackup.
///
/// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
/// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
///
/// The caller is responsible for getting the timeline into a state that will be accepted
/// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
/// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
/// to the [`Tenant::timelines`].
///
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
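///
/// A minimal sketch of the intended call sequence (hypothetical caller; the
/// data-import step is elided):
/// ```ignore
/// let uninit = tenant
///     .create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, &ctx)
///     .await?;
/// // ... import data so that disk_consistent_lsn becomes valid ...
/// let timeline = uninit.finish_creation()?;
/// ```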
pub(crate) async fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
_ctx: &RequestContext,
) -> anyhow::Result<UninitializedTimeline> {
anyhow::ensure!(
self.is_active(),
"Cannot create empty timelines on inactive tenant"
);
// Protect against concurrent attempts to use this TimelineId
let create_guard = match self
.start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(_) => {
unreachable!("FailWithConflict implies we get an error instead")
}
};
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent_lsn to 0. The caller must import some data to
// make it valid before calling finish_creation()
Lsn(0),
None,
None,
Lsn(0),
initdb_lsn,
initdb_lsn,
pg_version,
);
self.prepare_new_timeline(
new_timeline_id,
&new_metadata,
create_guard,
initdb_lsn,
None,
)
.await
}
/// Helper for unit tests to create an empty timeline.
///
/// The timeline has state value `Active` but its background loops are not running.
// This makes the various functions that anyhow::ensure! an Active state work in tests.
// Our current tests don't need the background loops.
#[cfg(test)]
pub async fn create_test_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let uninit_tl = self
.create_empty_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
let tline = uninit_tl.raw_timeline().expect("we just created it");
assert_eq!(tline.get_last_record_lsn(), Lsn(0));
// Setup minimum keys required for the timeline to be usable.
let mut modification = tline.begin_modification(initdb_lsn);
modification
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(ctx)
.await
.context("commit init_empty_test_timeline modification")?;
// Flush to disk so that uninit_tl's check for valid disk_consistent_lsn passes.
tline.maybe_spawn_flush_loop();
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline.remote_client.wait_completion().await.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
tl.set_state(TimelineState::Active);
Ok(tl)
}
/// Helper for unit tests to create a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn create_test_timeline_with_layers(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
pg_version: u32,
ctx: &RequestContext,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.create_test_timeline(new_timeline_id, initdb_lsn, pg_version, ctx)
.await?;
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(initdb_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
/// Create a new timeline.
///
/// Returns the new timeline ID and reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and a timeline with
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
self: &Arc<Tenant>,
params: CreateTimelineParams,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
if matches!(self.current_state(), TenantState::Stopping { .. }) {
return Err(CreateTimelineError::ShuttingDown);
} else {
return Err(CreateTimelineError::Other(anyhow::anyhow!(
"Cannot create timelines on inactive tenant"
)));
}
}
let _gate = self
.gate
.enter()
.map_err(|_| CreateTimelineError::ShuttingDown)?;
let result: CreateTimelineResult = match params {
CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap {
new_timeline_id,
existing_initdb_timeline_id,
pg_version,
}) => {
self.bootstrap_timeline(
new_timeline_id,
pg_version,
existing_initdb_timeline_id,
ctx,
)
.await?
}
CreateTimelineParams::Branch(CreateTimelineParamsBranch {
new_timeline_id,
ancestor_timeline_id,
mut ancestor_start_lsn,
}) => {
let ancestor_timeline = self
.get_timeline(ancestor_timeline_id, false)
.context("Cannot branch off the timeline that's not present in pageserver")?;
// instead of waiting around, just deny the request because ancestor is not yet
// ready for other purposes either.
if !ancestor_timeline.is_active() {
return Err(CreateTimelineError::AncestorNotActive);
}
if ancestor_timeline.is_archived() == Some(true) {
info!("tried to branch archived timeline");
return Err(CreateTimelineError::AncestorArchived);
}
if let Some(lsn) = ancestor_start_lsn.as_mut() {
*lsn = lsn.align();
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
// can we safely just branch from the ancestor instead?
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
)));
}
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline
.wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState { .. }) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
})?;
}
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
.await?
}
};
// At this point we have dropped our guard on [`Self::timelines_creating`], and
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to idempotent retries.
//
// TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly
// assume that, because they can see the timeline via API, that the creation is done and
// that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`])
// until it is durable, e.g., by extending the time we hold the creation guard. This also
// interacts with UninitializedTimeline and is generally a bit tricky.
//
// To re-emphasize: the only correct way to create a timeline is to repeat calling the
// creation API until it returns success. Only then is durability guaranteed.
info!(creation_result=%result.discriminant(), "waiting for timeline to be durable");
result
.timeline()
.remote_client
.wait_completion()
.await
.context("wait for timeline initial uploads to complete")?;
// The creating task is responsible for activating the timeline.
// We do this after `wait_completion()` so that we don't spin up tasks that start
// doing stuff before the IndexPart is durable in S3, which is done by the previous section.
let activated_timeline = match result {
CreateTimelineResult::Created(timeline) => {
timeline.activate(self.clone(), broker_client, None, ctx);
timeline
}
CreateTimelineResult::Idempotent(timeline) => {
info!(
"request was deemed idempotent, activation will be done by the creating task"
);
timeline
}
};
Ok(activated_timeline)
}
pub(crate) async fn delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
DeleteTimelineFlow::run(&self, timeline_id).await?;
Ok(())
}
/// Perform one garbage collection iteration, removing old data files from disk.
/// This function is periodically called by the GC task.
/// It can also be explicitly requested through the page server API 'do_gc' command.
///
/// `target_timeline_id` specifies the timeline to GC, or None for all.
///
/// The `horizon` and `pitr` parameters determine how much WAL history needs to be retained.
/// Also known as the retention period, or the GC cutoff point. `horizon` specifies
/// the amount of history, as an LSN difference from the current latest LSN on each timeline.
/// `pitr` specifies the same as a time difference from the current time. The effective
/// GC cutoff point is determined conservatively by whichever of `horizon` and `pitr`
/// requires more history to be retained.
//
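// An illustrative sketch (not the actual implementation; see
// `refresh_gc_info_internal` for the real logic) of how the two knobs combine
// into one cutoff, where `lsn_at` is a hypothetical time-to-LSN lookup:
//
//     let space_cutoff = last_record_lsn.checked_sub(horizon).unwrap_or(Lsn(0));
//     let time_cutoff = lsn_at(SystemTime::now() - pitr);
//     // The smaller LSN wins: it retains more history.
//     let cutoff = std::cmp::min(space_cutoff, time_cutoff);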
pub(crate) async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
// Don't start doing work during shutdown
if let TenantState::Stopping { .. } = self.current_state() {
return Ok(GcResult::default());
}
// there is a global allowed_error for this
if !self.is_active() {
return Err(GcError::NotActive);
}
{
let conf = self.tenant_conf.load();
if !conf.location.may_delete_layers_hint() {
info!("Skipping GC in location state {:?}", conf.location);
return Ok(GcResult::default());
}
if conf.is_gc_blocked_by_lsn_lease_deadline() {
info!("Skipping GC because lsn lease deadline is not reached");
return Ok(GcResult::default());
}
}
let _guard = match self.gc_block.start().await {
Ok(guard) => guard,
Err(reasons) => {
info!("Skipping GC: {reasons}");
return Ok(GcResult::default());
}
};
self.gc_iteration_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
/// Perform one compaction iteration.
/// This function is periodically called by compactor task.
/// It can also be explicitly requested per timeline through the page server
/// API's 'compact' command.
///
/// Returns whether we have pending compaction task.
async fn compaction_iteration(
self: &Arc<Self>,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<bool, timeline::CompactionError> {
// Don't start doing work during shutdown or when broken; we do not need those in the logs
if !self.is_active() {
return Ok(false);
}
{
let conf = self.tenant_conf.load();
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
info!("Skipping compaction in location state {:?}", conf.location);
return Ok(false);
}
}
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
// compaction runs.
let timelines_to_compact_or_offload;
{
let timelines = self.timelines.lock().unwrap();
timelines_to_compact_or_offload = timelines
.iter()
.filter_map(|(timeline_id, timeline)| {
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
let has_no_unoffloaded_children = {
!timelines
.iter()
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
};
let can_offload =
can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
if (is_active, can_offload) == (false, false) {
None
} else {
Some((*timeline_id, timeline.clone(), (is_active, can_offload)))
}
})
.collect::<Vec<_>>();
drop(timelines);
}
// Before doing any I/O work, check our circuit breaker
if self.compaction_circuit_breaker.lock().unwrap().is_broken() {
info!("Skipping compaction due to previous failures");
return Ok(false);
}
let mut has_pending_task = false;
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
{
let pending_task_left = if *can_compact {
Some(
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
})?,
)
} else {
None
};
has_pending_task |= pending_task_left.unwrap_or(false);
if pending_task_left == Some(false) && *can_offload {
offload_timeline(self, timeline)
.instrument(info_span!("offload_timeline", %timeline_id))
.await
.map_err(timeline::CompactionError::Other)?;
}
}
self.compaction_circuit_breaker
.lock()
.unwrap()
.success(&CIRCUIT_BREAKERS_UNBROKEN);
Ok(has_pending_task)
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
async fn ingest_housekeeping(&self) {
// Scan through the hashmap and collect a list of all the active timelines,
// while holding the lock. Then drop the lock and do the actual work. We
// don't want to block everything else while the housekeeping runs.
let timelines = {
self.timelines
.lock()
.unwrap()
.values()
.filter_map(|timeline| {
if timeline.is_active() {
Some(timeline.clone())
} else {
None
}
})
.collect::<Vec<_>>()
};
for timeline in &timelines {
timeline.maybe_freeze_ephemeral_layer().await;
}
}
pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
let timelines = self.timelines.lock().unwrap();
!timelines
.iter()
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id))
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
pub fn is_active(&self) -> bool {
self.current_state() == TenantState::Active
}
pub fn generation(&self) -> Generation {
self.generation
}
pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
/// to delay background jobs. Background jobs can be started right away when None is given.
fn activate(
self: &Arc<Self>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
span::debug_assert_current_span_has_tenant_id();
let mut activating = false;
self.state.send_modify(|current_state| {
use pageserver_api::models::ActivatingFrom;
match &*current_state {
TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => {
panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state);
}
TenantState::Attaching => {
*current_state = TenantState::Activating(ActivatingFrom::Attaching);
}
}
debug!(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), "Activating tenant");
activating = true;
// Continue outside the closure. We need to grab timelines.lock()
// and we plan to turn it into a tokio::sync::Mutex in a future patch.
});
if activating {
let timelines_accessor = self.timelines.lock().unwrap();
let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap();
let timelines_to_activate = timelines_accessor
.values()
.filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
// Before activation, populate each Timeline's GcInfo with information about its children
self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor);
// Spawn gc and compaction loops. The loops will shut themselves
// down when they notice that the tenant is inactive.
tasks::start_background_loops(self, background_jobs_can_start);
let mut activated_timelines = 0;
for timeline in timelines_to_activate {
timeline.activate(
self.clone(),
broker_client.clone(),
background_jobs_can_start,
ctx,
);
activated_timelines += 1;
}
self.state.send_modify(move |current_state| {
assert!(
matches!(current_state, TenantState::Activating(_)),
"set_stopping and set_broken wait for us to leave Activating state",
);
*current_state = TenantState::Active;
let elapsed = self.constructed_at.elapsed();
let total_timelines = timelines_accessor.len();
// Log a lot of stuff, because some tenants sometimes suffer from long, user-visible
// activation times. See https://github.com/neondatabase/neon/issues/4025
info!(
since_creation_millis = elapsed.as_millis(),
tenant_id = %self.tenant_shard_id.tenant_id,
shard_id = %self.tenant_shard_id.shard_slug(),
activated_timelines,
total_timelines,
post_state = <&'static str>::from(&*current_state),
"activation attempt finished"
);
TENANT.activation.observe(elapsed.as_secs_f64());
});
}
}
/// Shutdown the tenant and join all of the spawned tasks.
///
/// The method caters for all use-cases:
/// - pageserver shutdown (freeze_and_flush == true)
/// - detach + ignore (freeze_and_flush == false)
///
/// This will attempt to shut down even if the tenant is broken.
///
/// `shutdown_progress` is a [`completion::Barrier`] for the shutdown initiated by this call.
/// If the tenant is already shutting down, we return a clone of the first shutdown call's
/// `Barrier` as an `Err`. Such a later caller can use the returned barrier to join with
/// the ongoing shutdown.
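///
/// A minimal sketch of how a later caller joins an in-flight shutdown
/// (hypothetical caller; assumes `completion::Barrier::wait`):
/// ```ignore
/// match tenant.shutdown(progress, timeline::ShutdownMode::Hard).await {
///     Ok(()) => { /* this call drove the shutdown to completion */ }
///     Err(other) => other.wait().await, // join the first caller's shutdown
/// }
/// ```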
async fn shutdown(
&self,
shutdown_progress: completion::Barrier,
shutdown_mode: timeline::ShutdownMode,
) -> Result<(), completion::Barrier> {
span::debug_assert_current_span_has_tenant_id();
// Set the tenant (and its timelines) to Stopping state.
//
// Since we can only transition into Stopping state after activation is complete,
// run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
//
// Transitioning tenants to Stopping state has a couple of non-obvious side effects:
// 1. Lock out any new requests to the tenants.
// 2. Signal cancellation to WAL receivers (we wait on it below).
// 3. Signal cancellation for other tenant background loops.
// 4. ???
//
// The waiting for the cancellation is not done uniformly.
// We certainly wait for WAL receivers to shut down.
// That is necessary so that no new data comes in before the freeze_and_flush.
// But the tenant background loops are joined-on in our caller.
// It's messed up.
// we just ignore the failure to stop
// If we're still attaching, fire the cancellation token early to drop out: this
// will prevent us flushing, but ensures timely shutdown if some I/O during attach
// is very slow.
let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) {
self.cancel.cancel();
// Having fired our cancellation token, do not try and flush timelines: their cancellation tokens
// are children of ours, so their flush loops will have shut down already
timeline::ShutdownMode::Hard
} else {
shutdown_mode
};
match self.set_stopping(shutdown_progress, false, false).await {
Ok(()) => {}
Err(SetStoppingError::Broken) => {
// assume that this is acceptable
}
Err(SetStoppingError::AlreadyStopping(other)) => {
// give the caller the option to wait for this shutdown
info!("Tenant::shutdown: AlreadyStopping");
return Err(other);
}
};
let mut js = tokio::task::JoinSet::new();
{
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let timeline_id = timeline.timeline_id;
let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode);
js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await });
})
};
// test_long_timeline_create_then_tenant_delete is leaning on this message
tracing::info!("Waiting for timelines...");
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
}
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
// them to continue to do work during their shutdown methods, e.g. flushing data.
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// shutdown all tenant and timeline tasks: gc, compaction, page service
// No new tasks will be started for this tenant because it's in `Stopping` state.
//
// this will additionally shutdown and await all timeline tasks.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
walredo_mgr.shutdown().await;
}
// Wait for any in-flight operations to complete
self.gate.close().await;
remove_tenant_metrics(&self.tenant_shard_id);
Ok(())
}
/// Change tenant status to Stopping, to mark that it is being shut down.
///
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
///
/// This function is not cancel-safe!
///
/// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
/// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
async fn set_stopping(
&self,
progress: completion::Barrier,
_allow_transition_from_loading: bool,
allow_transition_from_attaching: bool,
) -> Result<(), SetStoppingError> {
let mut rx = self.state.subscribe();
// cannot stop before we're done activating, so wait until activation has finished
rx.wait_for(|state| match state {
TenantState::Attaching if allow_transition_from_attaching => true,
TenantState::Activating(_) | TenantState::Attaching => {
info!(
"waiting for {} to turn Active|Broken|Stopping",
<&'static str>::from(state)
);
false
}
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
})
.await
.expect("cannot drop self.state while on a &self method");
// we now know we're done activating, let's see whether this task is the winner to transition into Stopping
let mut err = None;
let stopping = self.state.send_if_modified(|current_state| match current_state {
TenantState::Activating(_) => {
unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Attaching => {
if !allow_transition_from_attaching {
unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
};
*current_state = TenantState::Stopping { progress };
true
}
TenantState::Active => {
// FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
// are created after the transition to Stopping. That's harmless, as the Timelines
// won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
*current_state = TenantState::Stopping { progress };
// Continue stopping outside the closure. We need to grab timelines.lock()
// and we plan to turn it into a tokio::sync::Mutex in a future patch.
true
}
TenantState::Broken { reason, .. } => {
info!(
"Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
);
err = Some(SetStoppingError::Broken);
false
}
TenantState::Stopping { progress } => {
info!("Tenant is already in Stopping state");
err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
false
}
});
match (stopping, err) {
(true, None) => {} // continue
(false, Some(err)) => return Err(err),
(true, Some(_)) => unreachable!(
"send_if_modified closure must error out if not transitioning to Stopping"
),
(false, None) => unreachable!(
"send_if_modified closure must return true if transitioning to Stopping"
),
}
let timelines_accessor = self.timelines.lock().unwrap();
let not_broken_timelines = timelines_accessor
.values()
.filter(|timeline| !timeline.is_broken());
for timeline in not_broken_timelines {
timeline.set_state(TimelineState::Stopping);
}
Ok(())
}
/// Method for tenant::mgr to transition us into Broken state in case of a late failure in
/// `remove_tenant_from_memory`
///
/// This function waits for the tenant to become active if it isn't already, before transitioning it into Broken state.
///
/// In tests, we also use this to set tenants to Broken state on purpose.
pub(crate) async fn set_broken(&self, reason: String) {
let mut rx = self.state.subscribe();
// The load & attach routines own the tenant state until it has reached `Active`.
// So, wait until it's done.
rx.wait_for(|state| match state {
TenantState::Activating(_) | TenantState::Attaching => {
info!(
"waiting for {} to turn Active|Broken|Stopping",
<&'static str>::from(state)
);
false
}
TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
})
.await
.expect("cannot drop self.state while on a &self method");
// we now know we're done activating, let's see whether this task is the winner to transition into Broken
self.set_broken_no_wait(reason)
}
pub(crate) fn set_broken_no_wait(&self, reason: impl Display) {
let reason = reason.to_string();
self.state.send_modify(|current_state| {
match *current_state {
TenantState::Activating(_) | TenantState::Attaching => {
unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
}
TenantState::Active => {
if cfg!(feature = "testing") {
warn!("Changing Active tenant to Broken state, reason: {}", reason);
*current_state = TenantState::broken_from_reason(reason);
} else {
unreachable!("not allowed to call set_broken on Active tenants in non-testing builds")
}
}
TenantState::Broken { .. } => {
warn!("Tenant is already in Broken state");
}
// This is the only "expected" path; any other path is a bug.
TenantState::Stopping { .. } => {
warn!(
"Marking Stopping tenant as Broken state, reason: {}",
reason
);
*current_state = TenantState::broken_from_reason(reason);
}
}
});
}
pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TenantState> {
self.state.subscribe()
}
/// The activate_now semaphore is initialized with zero units. As soon as
/// we add a unit, waiters will be able to acquire a unit and proceed.
pub(crate) fn activate_now(&self) {
self.activate_now_sem.add_permits(1);
}
pub(crate) async fn wait_to_become_active(
&self,
timeout: Duration,
) -> Result<(), GetActiveTenantError> {
let mut receiver = self.state.subscribe();
loop {
let current_state = receiver.borrow_and_update().clone();
match current_state {
TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
self.activate_now();
match timeout_cancellable(timeout, &self.cancel, receiver.changed()).await {
Ok(r) => {
r.map_err(
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id))
)?
}
Err(TimeoutCancellableError::Cancelled) => {
return Err(GetActiveTenantError::Cancelled);
}
Err(TimeoutCancellableError::Timeout) => {
return Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state: Some(self.current_state()),
wait_time: timeout,
});
}
}
}
TenantState::Active { .. } => {
return Ok(());
}
TenantState::Broken { reason, .. } => {
// This is fatal, and reported distinctly from the general case of "will never be active" because
// it's logically a 500 to external API users (broken is always a bug).
return Err(GetActiveTenantError::Broken(reason));
}
TenantState::Stopping { .. } => {
// There's no chance the tenant can transition back into ::Active
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
}
}
}
}
pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
self.tenant_conf.load().location.attach_mode
}
/// For API access: generate a LocationConfig equivalent to the one that would be used to
/// create a Tenant in the same state. Do not use this in hot paths: it's for relatively
/// rare external API calls, like a reconciliation at startup.
pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
let conf = self.tenant_conf.load();
let location_config_mode = match conf.location.attach_mode {
AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
};
// We have a pageserver TenantConf, we need the API-facing TenantConfig.
let tenant_config: models::TenantConfig = conf.tenant_conf.clone().into();
models::LocationConfig {
mode: location_config_mode,
generation: self.generation.into(),
secondary_conf: None,
shard_number: self.shard_identity.number.0,
shard_count: self.shard_identity.count.literal(),
shard_stripe_size: self.shard_identity.stripe_size.0,
tenant_conf: tenant_config,
}
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
self.shard_identity.stripe_size
}
pub(crate) fn get_generation(&self) -> Generation {
self.generation
}
/// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
/// and can leave the tenant in a bad state if it fails. The caller is responsible for
/// resetting this tenant to a valid state if we fail.
pub(crate) async fn split_prepare(
&self,
child_shards: &Vec<TenantShardId>,
) -> anyhow::Result<()> {
let (timelines, offloaded) = {
let timelines = self.timelines.lock().unwrap();
let offloaded = self.timelines_offloaded.lock().unwrap();
(timelines.clone(), offloaded.clone())
};
let timelines_iter = timelines
.values()
.map(TimelineOrOffloadedArcRef::<'_>::from)
.chain(
offloaded
.values()
.map(TimelineOrOffloadedArcRef::<'_>::from),
);
for timeline in timelines_iter {
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
// to ensure that they do not start a split while these are in progress.
let timeline_id = timeline.timeline_id();
if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline {
// Upload an index from the parent: this is partly to provide freshness for the
// child tenants that will copy it, and partly for general ease-of-debugging: there will
// always be a parent shard index in the same generation as we wrote the child shard index.
tracing::info!(%timeline_id, "Uploading index");
timeline
.remote_client
.schedule_index_upload_for_file_changes()?;
timeline.remote_client.wait_completion().await?;
}
let remote_client = match timeline {
TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(),
TimelineOrOffloadedArcRef::Offloaded(offloaded) => {
let remote_client = self
.build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
Arc::new(remote_client)
}
};
// Shut down the timeline's remote client: this means that the indices we write
// for child shards will not be invalidated by the parent shard deleting layers.
tracing::info!(%timeline_id, "Shutting down remote storage client");
remote_client.shutdown().await;
// Download methods can still be used after shutdown, as they don't flow through the remote client's
// queue. In principle the RemoteTimelineClient could provide this without downloading it, but this
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
// we use here really is the remotely persistent one).
tracing::info!(%timeline_id, "Downloading index_part from parent");
let result = remote_client
.download_index_file(&self.cancel)
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))
.await?;
let index_part = match result {
MaybeDeletedIndexPart::Deleted(_) => {
anyhow::bail!("Timeline deletion happened concurrently with split")
}
MaybeDeletedIndexPart::IndexPart(p) => p,
};
for child_shard in child_shards {
tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index());
upload_index_part(
&self.remote_storage,
child_shard,
&timeline_id,
self.generation,
&index_part,
&self.cancel,
)
.await?;
}
}
let tenant_manifest = self.build_tenant_manifest();
for child_shard in child_shards {
tracing::info!(
"Uploading tenant manifest for child {}",
child_shard.to_index()
);
upload_tenant_manifest(
&self.remote_storage,
child_shard,
self.generation,
&tenant_manifest,
&self.cancel,
)
.await?;
}
Ok(())
}
pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
let mut result = TopTenantShardItem {
id: self.tenant_shard_id,
resident_size: 0,
physical_size: 0,
max_logical_size: 0,
};
for timeline in self.timelines.lock().unwrap().values() {
result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
result.physical_size += timeline
.remote_client
.metrics
.remote_physical_size_gauge
.get();
result.max_logical_size = std::cmp::max(
result.max_logical_size,
timeline.metrics.current_logical_size_gauge.get(),
);
}
result
}
}
/// Given a map of timelines and their ancestors (timeline_id, ancestor_id),
/// perform a topological sort, so that the parent of each timeline comes
/// before its children.
/// `E` extracts the ancestor from `T`, which allows `T` to vary: it can be
/// `TimelineMetadata`, `Timeline` itself, etc.
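///
/// A minimal sketch of the expected ordering (hypothetical timeline IDs, with
/// `T = Option<TimelineId>` so the extractor is a plain copy):
/// ```ignore
/// let input = HashMap::from([(child, Some(parent)), (parent, None)]);
/// let sorted = tree_sort_timelines(input, |ancestor| *ancestor)?;
/// let ids: Vec<_> = sorted.iter().map(|(id, _)| *id).collect();
/// assert_eq!(ids, vec![parent, child]);
/// ```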
fn tree_sort_timelines<T, E>(
timelines: HashMap<TimelineId, T>,
extractor: E,
) -> anyhow::Result<Vec<(TimelineId, T)>>
where
E: Fn(&T) -> Option<TimelineId>,
{
let mut result = Vec::with_capacity(timelines.len());
let mut now = Vec::with_capacity(timelines.len());
// (ancestor, children)
let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
HashMap::with_capacity(timelines.len());
for (timeline_id, value) in timelines {
if let Some(ancestor_id) = extractor(&value) {
let children = later.entry(ancestor_id).or_default();
children.push((timeline_id, value));
} else {
now.push((timeline_id, value));
}
}
while let Some((timeline_id, metadata)) = now.pop() {
result.push((timeline_id, metadata));
// All children of this can be loaded now
if let Some(mut children) = later.remove(&timeline_id) {
now.append(&mut children);
}
}
// All timelines should have been visited by now, unless some timelines have missing ancestors.
if !later.is_empty() {
for (missing_id, orphan_ids) in later {
for (orphan_id, _) in orphan_ids {
error!("could not load timeline {orphan_id} because its ancestor timeline {missing_id} could not be loaded");
}
}
bail!("could not load tenant because some timelines are missing ancestors");
}
Ok(result)
}
impl Tenant {
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
}
pub fn effective_config(&self) -> TenantConf {
self.tenant_specific_overrides()
.merge(self.conf.default_tenant_conf.clone())
}
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.checkpoint_distance
.unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
}
pub fn get_checkpoint_timeout(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.checkpoint_timeout
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub fn get_compaction_target_size(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_target_size
.unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
}
pub fn get_compaction_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_period
.unwrap_or(self.conf.default_tenant_conf.compaction_period)
}
pub fn get_compaction_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.compaction_threshold
.unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
}
pub fn get_gc_horizon(&self) -> u64 {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.gc_horizon
.unwrap_or(self.conf.default_tenant_conf.gc_horizon)
}
pub fn get_gc_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.gc_period
.unwrap_or(self.conf.default_tenant_conf.gc_period)
}
pub fn get_image_creation_threshold(&self) -> usize {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.image_creation_threshold
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
}
pub fn get_pitr_interval(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.pitr_interval
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_min_resident_size_override(&self) -> Option<u64> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.min_resident_size_override
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
if heatmap_period.is_zero() {
None
} else {
Some(heatmap_period)
}
}
pub fn get_lsn_lease_length(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.lsn_lease_length
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
fn build_tenant_manifest(&self) -> TenantManifest {
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
let mut timeline_manifests = timelines_offloaded
.iter()
.map(|(_timeline_id, offloaded)| offloaded.manifest())
.collect::<Vec<_>>();
// Sort the manifests so that our output is deterministic
timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
TenantManifest {
version: LATEST_TENANT_MANIFEST_VERSION,
offloaded_timelines: timeline_manifests,
}
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
self.tenant_conf.rcu(|inner| {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
// Attached location is not changed, no need to update lsn lease deadline.
lsn_lease_deadline: inner.lsn_lease_deadline,
})
});
self.tenant_conf_updated(&new_tenant_conf);
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
}
}
pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
let new_tenant_conf = new_conf.tenant_conf.clone();
self.tenant_conf.store(Arc::new(new_conf));
self.tenant_conf_updated(&new_tenant_conf);
// Don't hold self.timelines.lock() during the notifies.
// There's no risk of deadlock right now, but there could be if we consolidate
// mutexes in struct Timeline in the future.
let timelines = self.list_timelines();
for timeline in timelines {
timeline.tenant_conf_updated(&new_tenant_conf);
}
}
fn get_timeline_get_throttle_config(
psconf: &'static PageServerConf,
overrides: &TenantConfOpt,
) -> throttle::Config {
overrides
.timeline_get_throttle
.clone()
.unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone())
}
pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) {
let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf);
self.timeline_get_throttle.reconfigure(conf)
}
/// Helper function to create a new Timeline struct.
///
/// The returned Timeline is in Loading state. The caller is responsible for
/// initializing any on-disk state, and for inserting the Timeline into the 'timelines'
/// map.
///
/// `validate_ancestor == false` is used when a timeline is created for deletion
/// and we might not have the ancestor present anymore, which is fine for
/// timelines that are about to be deleted.
fn create_timeline_struct(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
ancestor: Option<Arc<Timeline>>,
resources: TimelineResources,
cause: CreateTimelineCause,
create_idempotency: CreateTimelineIdempotency,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
let ancestor_id = new_metadata.ancestor_timeline();
anyhow::ensure!(
ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
"Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
);
TimelineState::Loading
}
CreateTimelineCause::Delete => TimelineState::Stopping,
};
let pg_version = new_metadata.pg_version();
let timeline = Timeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
ancestor,
new_timeline_id,
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.clone(),
resources,
pg_version,
state,
self.attach_wal_lag_cooldown.clone(),
create_idempotency,
self.cancel.child_token(),
);
Ok(timeline)
}
// Allow too_many_arguments because a constructor's argument list naturally grows with the
// number of attributes in the struct: breaking these out into a builder wouldn't be helpful.
#[allow(clippy::too_many_arguments)]
fn new(
state: TenantState,
conf: &'static PageServerConf,
attached_conf: AttachedTenantConf,
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<WalRedoManager>>,
tenant_shard_id: TenantShardId,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
) -> Tenant {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
);
let (state, mut rx) = watch::channel(state);
tokio::spawn(async move {
// reflect tenant state in metrics:
// - global per tenant state: TENANT_STATE_METRIC
// - "set" of broken tenants: BROKEN_TENANTS_SET
//
// set of broken tenants should not have zero counts so that it remains accessible for
// alerting.
let tid = tenant_shard_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
}
let mut tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
let mut counted_broken = if is_broken {
// add the id to the set right away; there should not be any updates on the channel
// after this, before the tenant is removed, if ever
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
true
} else {
false
};
loop {
let labels = &tuple.0;
let current = TENANT_STATE_METRIC.with_label_values(labels);
current.inc();
if rx.changed().await.is_err() {
// tenant has been dropped
current.dec();
drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
break;
}
current.dec();
tuple = inspect_state(&rx.borrow_and_update());
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set while avoiding needless counter
// access
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
}
}
});
Tenant {
tenant_shard_id,
shard_identity,
generation: attached_conf.location.generation,
conf,
// Using `now` here is a good enough approximation to catch tenants with really long
// activation times.
constructed_at: Instant::now(),
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
timelines_offloaded: Mutex::new(HashMap::new()),
tenant_manifest_upload: Default::default(),
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
deletion_queue_client,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
compaction_circuit_breaker: std::sync::Mutex::new(CircuitBreaker::new(
format!("compaction-{tenant_shard_id}"),
5,
// Compaction can be a very expensive operation, and might leak disk space. It also ought
// to be infallible, as long as remote storage is available. So if it repeatedly fails,
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
activate_now_sem: tokio::sync::Semaphore::new(0),
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
cancel: CancellationToken::default(),
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf),
crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id),
)),
tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)),
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
}
}
/// Locate and load config
pub(super) fn load_tenant_config(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
) -> Result<LocationConf, LoadConfigError> {
let config_path = conf.tenant_location_config_path(tenant_shard_id);
info!("loading tenant configuration from {config_path}");
// load and parse file
let config = fs::read_to_string(&config_path).map_err(|e| {
match e.kind() {
std::io::ErrorKind::NotFound => {
// The config should almost always exist for a tenant directory:
// - When attaching a tenant, the config is the first thing we write
// - When detaching a tenant, we atomically move the directory to a tmp location
// before deleting contents.
//
// The very rare edge case that can result in a missing config is if we crash during attach
// between creating directory and writing config. Callers should handle that as if the
// directory didn't exist.
LoadConfigError::NotFound(config_path)
}
_ => {
// No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues
// that we cannot cleanly recover
crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file")
}
}
})?;
Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
}
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
location_conf: &LocationConf,
) -> std::io::Result<()> {
let config_path = conf.tenant_location_config_path(tenant_shard_id);
Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await
}
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(super) async fn persist_tenant_config_at(
tenant_shard_id: &TenantShardId,
config_path: &Utf8Path,
location_conf: &LocationConf,
) -> std::io::Result<()> {
debug!("persisting tenantconf to {config_path}");
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
"#
.to_string();
fail::fail_point!("tenant-config-before-write", |_| {
Err(std::io::Error::new(
std::io::ErrorKind::Other,
"tenant-config-before-write",
))
});
// Convert the config to a toml file.
conf_content +=
&toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed");
let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
let conf_content = conf_content.into_bytes();
VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await
}
//
// How garbage collection works:
//
// +--bar------------->
// /
// +----+-----foo---------------->
// /
// ----main--+-------------------------->
// \
// +-----baz-------->
//
//
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
// `gc_infos` are being refreshed
// 2. Scan collected timelines, and on each timeline, make note of
// all the points where other timelines have been branched off.
// We will refrain from removing page versions at those LSNs.
// 3. For each timeline, scan all layer files on the timeline.
// Remove all files for which a newer file exists and which
// don't cover any branch point LSNs.
//
// TODO:
// - if a relation has a non-incremental persistent layer on a child branch, then we
// don't need to keep that in the parent anymore. But currently
// we do.
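//
// A condensed sketch of the per-layer retention decision from steps 1-3 above
// (illustrative only; the real logic lives in the timeline's GC code):
//
//     remove layer if a newer layer covers the same data
//         && layer does not cover any branchpoint LSN // no child depends on it
//         && layer is entirely below the GC cutoff    // outside horizon/pitr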
async fn gc_iteration_internal(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, GcError> {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = self
.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await?;
failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
// If there is nothing to GC, we don't want any messages in the INFO log.
if !gc_timelines.is_empty() {
info!("{} timelines need GC", gc_timelines.len());
} else {
debug!("{} timelines need GC", gc_timelines.len());
}
// Perform GC for each timeline.
//
// Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
// branch creation task, which requires the GC lock. A GC iteration can run concurrently
// with branch creation.
//
// See comments in [`Tenant::branch_timeline`] for more information about why branch
// creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if cancel.is_cancelled() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
}
let result = match timeline.gc().await {
Err(GcError::TimelineCancelled) => {
if target_timeline_id.is_some() {
// If we were targeting this specific timeline, surface cancellation to the caller
return Err(GcError::TimelineCancelled);
} else {
// A timeline may be shutting down independently of the tenant's lifecycle: we should
// skip past this and proceed to try GC on other timelines.
continue;
}
}
r => r?,
};
totals += result;
}
totals.elapsed = now.elapsed();
Ok(totals)
}
/// Refreshes the Timeline::gc_info for all timelines, returning the
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
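///
/// An illustrative call (a sketch; `cancel` and `ctx` are the caller's
/// cancellation token and request context):
///
/// ```ignore
/// // Refresh all timelines with the tenant's configured horizon and PITR interval:
/// let gc_timelines = tenant.refresh_gc_info(&cancel, &ctx).await?;
/// ```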
pub(crate) async fn refresh_gc_info(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
// Since this method can now be called at different rates than the configured gc loop,
// these configuration values may take effect sooner than they previously did, when
// they were only read from the gc task.
let horizon = self.get_gc_horizon();
let pitr = self.get_pitr_interval();
// refresh all timelines
let target_timeline_id = None;
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, cancel, ctx)
.await
}
/// Populate all Timelines' `GcInfo` with information about their children. We do not set the
/// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`]
///
/// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`].
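///
/// A sketch of the intermediate branchpoint map this function builds
/// (hypothetical timelines `main`, `foo`, and `bar`, with `foo` and `bar`
/// branched off `main`):
///
/// ```ignore
/// // all_branchpoints = {
/// //     main_id => [(foo_branch_lsn, foo_id, MaybeOffloaded::No),
/// //                 (bar_branch_lsn, bar_id, MaybeOffloaded::No)],
/// // }
/// ```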
fn initialize_gc_info(
&self,
timelines: &std::sync::MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
timelines_offloaded: &std::sync::MutexGuard<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
) {
// This function must be called before activation: after activation timeline create/delete operations
// might happen, and this function is not safe to run concurrently with those.
assert!(!self.is_active());
// Scan all timelines. For each timeline, remember the timeline ID and
// the branch point where it was created.
let mut all_branchpoints: BTreeMap<TimelineId, Vec<(Lsn, TimelineId, MaybeOffloaded)>> =
BTreeMap::new();
timelines.iter().for_each(|(timeline_id, timeline_entry)| {
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
ancestor_children.push((
timeline_entry.get_ancestor_lsn(),
*timeline_id,
MaybeOffloaded::No,
));
}
});
timelines_offloaded
.iter()
.for_each(|(timeline_id, timeline_entry)| {
let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else {
return;
};
let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else {
return;
};
let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default();
ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes));
});
// The number of bytes we always keep, irrespective of PITR: this is a constant across timelines
let horizon = self.get_gc_horizon();
// Populate each timeline's GcInfo with information about its child branches
for timeline in timelines.values() {
let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints
.remove(&timeline.timeline_id)
.unwrap_or_default();
branchpoints.sort_by_key(|b| b.0);
let mut target = timeline.gc_info.write().unwrap();
target.retain_lsns = branchpoints;
let space_cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
target.cutoffs = GcCutoffs {
space: space_cutoff,
time: Lsn::INVALID,
};
}
}
async fn refresh_gc_info_internal(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<Vec<Arc<Timeline>>, GcError> {
// Before taking the gc_cs lock, do the heavier-weight work of finding gc_cutoff points
// for the currently visible timelines.
let timelines = self
.timelines
.lock()
.unwrap()
.values()
.filter(|tl| match target_timeline_id.as_ref() {
Some(target) => &tl.timeline_id == target,
None => true,
})
.cloned()
.collect::<Vec<_>>();
if target_timeline_id.is_some() && timelines.is_empty() {
// We were asked to act on a particular timeline, but it wasn't found
return Err(GcError::TimelineNotFound);
}
let mut gc_cutoffs: HashMap<TimelineId, GcCutoffs> =
HashMap::with_capacity(timelines.len());
for timeline in timelines.iter() {
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
let cutoffs = timeline.find_gc_cutoffs(cutoff, pitr, cancel, ctx).await?;
let old = gc_cutoffs.insert(timeline.timeline_id, cutoffs);
assert!(old.is_none());
}
if !self.is_active() || self.cancel.is_cancelled() {
return Err(GcError::TenantCancelled);
}
// grab mutex to prevent new timelines from being created here; avoid doing long operations
// because that will stall branch creation.
let gc_cs = self.gc_cs.lock().await;
// Ok, we now know all the branch points.
// Update the GC information for each timeline.
let mut gc_timelines = Vec::with_capacity(timelines.len());
for timeline in timelines {
// We filtered the timeline list above
if let Some(target_timeline_id) = target_timeline_id {
assert_eq!(target_timeline_id, timeline.timeline_id);
}
{
let mut target = timeline.gc_info.write().unwrap();
// Cull any expired leases
let now = SystemTime::now();
target.leases.retain(|_, lease| !lease.is_expired(&now));
timeline
.metrics
.valid_lsn_lease_count_gauge
.set(target.leases.len() as u64);
// Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
}
}
// Update metrics that depend on GC state
timeline
.metrics
.archival_size
.set(if target.within_ancestor_pitr {
timeline.metrics.current_logical_size_gauge.get()
} else {
0
});
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
// - this timeline was created while we were finding cutoffs
// - the LSN-for-timestamp search fails for this timeline repeatedly
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
target.cutoffs = cutoffs.clone();
}
}
gc_timelines.push(timeline);
}
drop(gc_cs);
Ok(gc_timelines)
}
/// A substitute for `branch_timeline` for use in unit tests.
/// The returned timeline will have state value `Active` to make various `anyhow::ensure!()`
/// calls pass, but we do not actually call `.activate()` under the hood, so none of the
/// timeline background tasks are launched, except the flush loop.
#[cfg(test)]
async fn branch_timeline_test(
self: &Arc<Self>,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let tl = self
.branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx)
.await?
.into_timeline_for_test();
tl.set_state(TimelineState::Active);
Ok(tl)
}
/// Helper for unit tests to branch a timeline with some pre-loaded states.
#[cfg(test)]
#[allow(clippy::too_many_arguments)]
pub async fn branch_timeline_test_with_layers(
self: &Arc<Self>,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
ancestor_lsn: Option<Lsn>,
ctx: &RequestContext,
delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
end_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
use checks::check_valid_layermap;
use itertools::Itertools;
let tline = self
.branch_timeline_test(src_timeline, dst_id, ancestor_lsn, ctx)
.await?;
let ancestor_lsn = if let Some(ancestor_lsn) = ancestor_lsn {
ancestor_lsn
} else {
tline.get_last_record_lsn()
};
assert!(end_lsn >= ancestor_lsn);
tline.force_advance_lsn(end_lsn);
for deltas in delta_layer_desc {
tline
.force_create_delta_layer(deltas, Some(ancestor_lsn), ctx)
.await?;
}
for (lsn, images) in image_layer_desc {
tline
.force_create_image_layer(lsn, images, Some(ancestor_lsn), ctx)
.await?;
}
let layer_names = tline
.layers
.read()
.await
.layer_map()
.unwrap()
.iter_historic_layers()
.map(|layer| layer.layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
bail!("invalid layermap: {err}");
}
Ok(tline)
}
/// Branch an existing timeline.
async fn branch_timeline(
self: &Arc<Self>,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.await
}
async fn branch_timeline_impl(
self: &Arc<Self>,
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
_ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
// We will validate our ancestor LSN in this function. Acquire the GC lock so that
// this check cannot race with GC, and the ancestor LSN is guaranteed to remain
// valid while we are creating the branch.
let _gc_cs = self.gc_cs.lock().await;
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
let start_lsn = start_lsn.unwrap_or_else(|| {
let lsn = src_timeline.get_last_record_lsn();
info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}");
lsn
});
// we finally have determined the ancestor_start_lsn, so we can claim exclusivity now
let timeline_create_guard = match self
.start_creating_timeline(
dst_id,
CreateTimelineIdempotency::Branch {
ancestor_timeline_id: src_timeline.timeline_id,
ancestor_start_lsn: start_lsn,
},
)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(timeline) => {
return Ok(CreateTimelineResult::Idempotent(timeline));
}
};
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
// horizon on the source timeline
//
// We check it against both the planned GC cutoff stored in 'gc_info',
// and the 'latest_gc_cutoff' of the last GC that was performed. The
// planned GC cutoff in 'gc_info' is normally larger than
// 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
// changed the GC settings for the tenant to make the PITR window
// larger, but some of the data was already removed by an earlier GC
// iteration.
// check against last actual 'latest_gc_cutoff' first
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn,
))
.map_err(CreateTimelineError::AncestorLsn)?;
// and then the planned GC cutoff
{
let gc_info = src_timeline.gc_info.read().unwrap();
let cutoff = gc_info.min_cutoff();
if start_lsn < cutoff {
return Err(CreateTimelineError::AncestorLsn(anyhow::anyhow!(
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
)));
}
}
//
// The branch point is valid, and we are still holding the 'gc_cs' lock
// so that GC cannot advance the GC cutoff until we are finished.
// Proceed with the branch creation.
//
// Determine prev-LSN for the new timeline. We can only determine it if
// the timeline was branched at the current end of the source timeline.
let RecordLsn {
last: src_last,
prev: src_prev,
} = src_timeline.get_last_record_rlsn();
let dst_prev = if src_last == start_lsn {
Some(src_prev)
} else {
None
};
// Create the metadata file, noting the ancestor of the new timeline.
// There is initially no data in it, but all the read-calls know to look
// into the ancestor.
let metadata = TimelineMetadata::new(
start_lsn,
dst_prev,
Some(src_id),
start_lsn,
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
);
let uninitialized_timeline = self
.prepare_new_timeline(
dst_id,
&metadata,
timeline_create_guard,
start_lsn + 1,
Some(Arc::clone(src_timeline)),
)
.await?;
let new_timeline = uninitialized_timeline.finish_creation()?;
// Root timeline gets its layers during creation and uploads them along with the metadata.
// A branch timeline, however, may receive no writes for some time after creation, and hence
// won't get any layers created. We still need to upload its metadata eagerly: if other nodes
// `attach` the tenant and miss this timeline, their GC could get incorrect information and
// remove more layers than needed.
// See also https://github.com/neondatabase/neon/issues/3865
new_timeline
.remote_client
.schedule_index_upload_for_full_metadata_update(&metadata)
.context("branch initial metadata upload")?;
// Callers are responsible for waiting for uploads to complete and for activating the timeline.
Ok(CreateTimelineResult::Created(new_timeline))
}
/// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)]
#[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn bootstrap_timeline_test(
self: &Arc<Self>,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx)
.await
.map_err(anyhow::Error::new)
.map(|r| r.into_timeline_for_test())
}
/// Get exclusive access to the timeline ID for creation.
///
/// Timeline-creating code paths must use this function before making changes
/// to in-memory or persistent state.
///
/// The `idempotency` parameter is a description of the timeline creation operation
/// we intend to perform.
/// If the timeline was already created in the meantime, we check whether this
/// request conflicts or is idempotent, based on `idempotency`.
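///
/// An illustrative call site (a sketch mirroring how `bootstrap_timeline`
/// uses this function):
///
/// ```ignore
/// let guard = match self
///     .start_creating_timeline(
///         timeline_id,
///         CreateTimelineIdempotency::Bootstrap { pg_version },
///     )
///     .await?
/// {
///     StartCreatingTimelineResult::CreateGuard(guard) => guard,
///     StartCreatingTimelineResult::Idempotent(timeline) => {
///         return Ok(CreateTimelineResult::Idempotent(timeline))
///     }
/// };
/// ```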
async fn start_creating_timeline(
&self,
new_timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
) -> Result<StartCreatingTimelineResult<'_>, CreateTimelineError> {
let allow_offloaded = false;
match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) {
Ok(create_guard) => {
pausable_failpoint!("timeline-creation-after-uninit");
Ok(StartCreatingTimelineResult::CreateGuard(create_guard))
}
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
// check if this request matches the existing one, so the caller must try
// again later.
Err(CreateTimelineError::AlreadyCreating)
}
Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)),
Err(TimelineExclusionError::AlreadyExists {
existing: TimelineOrOffloaded::Offloaded(_existing),
..
}) => {
info!("timeline already exists but is offloaded");
Err(CreateTimelineError::Conflict)
}
Err(TimelineExclusionError::AlreadyExists {
existing: TimelineOrOffloaded::Timeline(existing),
arg,
}) => {
{
let existing = &existing.create_idempotency;
let _span = info_span!("idempotency_check", ?existing, ?arg).entered();
debug!("timeline already exists");
match (existing, &arg) {
// FailWithConflict => no idempotency check
(CreateTimelineIdempotency::FailWithConflict, _)
| (_, CreateTimelineIdempotency::FailWithConflict) => {
warn!("timeline already exists, failing request");
return Err(CreateTimelineError::Conflict);
}
// Idempotent <=> CreateTimelineIdempotency is identical
(x, y) if x == y => {
info!("timeline already exists and idempotency matches, succeeding request");
// fallthrough
}
(_, _) => {
warn!("idempotency conflict, failing request");
return Err(CreateTimelineError::Conflict);
}
}
}
Ok(StartCreatingTimelineResult::Idempotent(existing))
}
}
}
async fn upload_initdb(
&self,
timelines_path: &Utf8PathBuf,
pgdata_path: &Utf8PathBuf,
timeline_id: &TimelineId,
) -> anyhow::Result<()> {
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
scopeguard::defer! {
if let Err(e) = fs::remove_file(&temp_path) {
error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
}
}
let (pgdata_zstd, tar_zst_size) = create_zst_tarball(pgdata_path, &temp_path).await?;
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
if tar_zst_size > INITDB_TAR_ZST_WARN_LIMIT {
warn!(
"compressed {temp_path} size of {tar_zst_size} is above limit {INITDB_TAR_ZST_WARN_LIMIT}."
);
}
pausable_failpoint!("before-initdb-upload");
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
&self.remote_storage,
&self.tenant_shard_id.tenant_id,
timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
&self.cancel,
)
.await
},
|_| false,
3,
u32::MAX,
"persist_initdb_tar_zst",
&self.cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
}
/// - run initdb to init a temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
async fn bootstrap_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> Result<CreateTimelineResult, CreateTimelineError> {
let timeline_create_guard = match self
.start_creating_timeline(
timeline_id,
CreateTimelineIdempotency::Bootstrap { pg_version },
)
.await?
{
StartCreatingTimelineResult::CreateGuard(guard) => guard,
StartCreatingTimelineResult::Idempotent(timeline) => {
return Ok(CreateTimelineResult::Idempotent(timeline))
}
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
let pgdata_path = path_with_suffix_extension(
timelines_path.join(format!("basebackup-{timeline_id}")),
TEMP_FILE_SUFFIX,
);
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
// we won't race with other creations or existing timelines with the same path.
if pgdata_path.exists() {
fs::remove_dir_all(&pgdata_path).with_context(|| {
format!("Failed to remove already existing initdb directory: {pgdata_path}")
})?;
}
// This new directory is only temporary: arrange to remove it immediately after bootstrap; we don't need it afterwards
scopeguard::defer! {
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
// this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
}
}
if let Some(existing_initdb_timeline_id) = load_existing_initdb {
if existing_initdb_timeline_id != timeline_id {
let source_path = &remote_initdb_archive_path(
&self.tenant_shard_id.tenant_id,
&existing_initdb_timeline_id,
);
let dest_path =
&remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
// if this fails, it will get retried when the control plane retries its request
self.remote_storage
.copy_object(source_path, dest_path, &self.cancel)
.await
.context("copy initdb tar")?;
}
let (initdb_tar_zst_path, initdb_tar_zst) =
self::remote_timeline_client::download_initdb_tar_zst(
self.conf,
&self.remote_storage,
&self.tenant_shard_id,
&existing_initdb_timeline_id,
&self.cancel,
)
.await
.context("download initdb tar")?;
scopeguard::defer! {
if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
}
}
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
extract_zst_tarball(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
} else {
// Initialize a temporary repo to get bootstrap data; this creates a directory at `pgdata_path`
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
.await
.context("run initdb")?;
// Upload the created data dir to S3
if self.tenant_shard_id().is_shard_zero() {
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
.await?;
}
}
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
// Import the contents of the data directory at the initial checkpoint
// LSN, and any WAL after that.
// The initdb LSN will be equal to last_record_lsn, which will be set after import.
// Because we know it upfront, we avoid an Option or a dummy zero value by passing it to the metadata.
let new_metadata = TimelineMetadata::new(
Lsn(0),
None,
None,
Lsn(0),
pgdata_lsn,
pgdata_lsn,
pg_version,
);
let raw_timeline = self
.prepare_new_timeline(
timeline_id,
&new_metadata,
timeline_create_guard,
pgdata_lsn,
None,
)
.await?;
let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
// Flush the new layer files to disk, before we make the timeline available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
unfinished_timeline.maybe_spawn_flush_loop();
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
&pgdata_path,
pgdata_lsn,
ctx,
)
.await
.with_context(|| {
format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}")
})?;
fail::fail_point!("before-checkpoint-new-timeline", |_| {
Err(CreateTimelineError::Other(anyhow::anyhow!(
"failpoint before-checkpoint-new-timeline"
)))
});
unfinished_timeline
.freeze_and_flush()
.await
.with_context(|| {
format!(
"Failed to flush after pgdatadir import for timeline {tenant_shard_id}/{timeline_id}"
)
})?;
// All done!
let timeline = raw_timeline.finish_creation()?;
// Callers are responsible for waiting for uploads to complete and for activating the timeline.
Ok(CreateTimelineResult::Created(timeline))
}
fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient {
RemoteTimelineClient::new(
self.remote_storage.clone(),
self.deletion_queue_client.clone(),
self.conf,
self.tenant_shard_id,
timeline_id,
self.generation,
)
}
/// Call this before constructing a timeline, to build its required structures
fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
TimelineResources {
remote_client: self.build_timeline_remote_client(timeline_id),
timeline_get_throttle: self.timeline_get_throttle.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
}
}
/// Creates intermediate timeline structure and its files.
///
/// An empty layer map is initialized, and new data and WAL can be imported starting
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map.
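///
/// A sketch of the typical creation flow (mirroring `bootstrap_timeline`;
/// `new_id`, `metadata`, and `create_guard` are assumed to be prepared by
/// the caller):
///
/// ```ignore
/// let uninit = self
///     .prepare_new_timeline(new_id, &metadata, create_guard, start_lsn, None)
///     .await?;
/// // ... import initial data into the raw timeline ...
/// let timeline = uninit.finish_creation()?;
/// ```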
async fn prepare_new_timeline<'a>(
&'a self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
create_guard: TimelineCreateGuard<'a>,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline<'a>> {
let tenant_shard_id = self.tenant_shard_id;
let resources = self.build_timeline_resources(new_timeline_id);
resources
.remote_client
.init_upload_queue_for_empty_remote(new_metadata)?;
let timeline_struct = self
.create_timeline_struct(
new_timeline_id,
new_metadata,
ancestor,
resources,
CreateTimelineCause::Load,
create_guard.idempotency.clone(),
)
.context("Failed to create timeline data structure")?;
timeline_struct.init_empty_layer_map(start_lsn);
if let Err(e) = self
.create_timeline_files(&create_guard.timeline_path)
.await
{
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
cleanup_timeline_directory(create_guard);
return Err(e);
}
debug!(
"Successfully created initial files for timeline {tenant_shard_id}/{new_timeline_id}"
);
Ok(UninitializedTimeline::new(
self,
new_timeline_id,
Some((timeline_struct, create_guard)),
))
}
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
fail::fail_point!("after-timeline-dir-creation", |_| {
anyhow::bail!("failpoint after-timeline-dir-creation");
});
Ok(())
}
/// Get a guard that provides exclusive access to the timeline directory, preventing
/// concurrent attempts to create the same timeline.
///
/// The `allow_offloaded` parameter controls whether to tolerate the existence of
/// offloaded timelines or not.
fn create_timeline_create_guard(
&self,
timeline_id: TimelineId,
idempotency: CreateTimelineIdempotency,
allow_offloaded: bool,
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
let tenant_shard_id = self.tenant_shard_id;
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
let create_guard = TimelineCreateGuard::new(
self,
timeline_id,
timeline_path.clone(),
idempotency,
allow_offloaded,
)?;
// At this stage, we have got exclusive access to in-memory state for this timeline ID
// for creation.
// A timeline directory should never exist on disk already:
// - a previous failed creation would have cleaned up after itself
// - a pageserver restart would clean up timeline directories that don't have valid remote state
//
// Therefore it is an unexpected internal error to encounter a timeline directory already existing here;
// this error may indicate a bug in cleanup on failed creations.
if timeline_path.exists() {
return Err(TimelineExclusionError::Other(anyhow::anyhow!(
"Timeline directory already exists! This is a bug."
)));
}
Ok(create_guard)
}
/// Gathers inputs from all of the timelines to produce a sizing model input.
///
/// Future is cancellation safe. Only one calculation can be running at once per tenant.
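///
/// An illustrative call (a sketch; `None` means no retention-period override,
/// as used by `calculate_synthetic_size` below):
///
/// ```ignore
/// let inputs = tenant.gather_size_inputs(None, cause, &cancel, &ctx).await?;
/// let size = inputs.calculate();
/// ```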
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub async fn gather_size_inputs(
&self,
// `max_retention_period` overrides the cutoff that is used to calculate the size
// (only if it is shorter than the real cutoff).
max_retention_period: Option<u64>,
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<size::ModelInputs, size::CalculateSyntheticSizeError> {
let logical_sizes_at_once = self
.conf
.concurrent_tenant_size_logical_size_queries
.inner();
// TODO: Having a single mutex block concurrent reads is not great for performance.
//
// But the only case where we need to run multiple of these at once is when we
// request a size for a tenant manually via API, while another background calculation
// is in progress (which is not a common case).
//
// See more on issue #2748, condensed out of the initial PR review.
let mut shared_cache = tokio::select! {
locked = self.cached_logical_sizes.lock() => locked,
_ = cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
_ = self.cancel.cancelled() => return Err(size::CalculateSyntheticSizeError::Cancelled),
};
size::gather_inputs(
self,
logical_sizes_at_once,
max_retention_period,
&mut shared_cache,
cause,
cancel,
ctx,
)
.await
}
/// Calculate synthetic tenant size and cache the result.
/// This is periodically called by a background worker.
/// The result is cached in the tenant struct.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub async fn calculate_synthetic_size(
&self,
cause: LogicalSizeCalculationCause,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, size::CalculateSyntheticSizeError> {
let inputs = self.gather_size_inputs(None, cause, cancel, ctx).await?;
let size = inputs.calculate();
self.set_cached_synthetic_size(size);
Ok(size)
}
/// Cache given synthetic size and update the metric value
pub fn set_cached_synthetic_size(&self, size: u64) {
self.cached_synthetic_tenant_size
.store(size, Ordering::Relaxed);
// Only shard zero should be calculating synthetic sizes
debug_assert!(self.shard_identity.is_shard_zero());
TENANT_SYNTHETIC_SIZE_METRIC
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
.unwrap()
.set(size);
}
pub fn cached_synthetic_size(&self) -> u64 {
self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
}
/// Flush any in-progress layers, schedule uploads, and wait for uploads to complete.
///
/// This function can take a long time: callers should wrap it in a timeout if calling
/// from an external API handler.
///
/// Cancel-safety: cancelling this function may leave I/O running, but such I/O is
/// still bounded by tenant/timeline shutdown.
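///
/// A sketch of wrapping this in a timeout from an API handler (the 30-second
/// duration is an arbitrary illustration):
///
/// ```ignore
/// tokio::time::timeout(std::time::Duration::from_secs(30), tenant.flush_remote()).await??;
/// ```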
#[tracing::instrument(skip_all)]
pub(crate) async fn flush_remote(&self) -> anyhow::Result<()> {
let timelines = self.timelines.lock().unwrap().clone();
async fn flush_timeline(_gate: GateGuard, timeline: Arc<Timeline>) -> anyhow::Result<()> {
tracing::info!(timeline_id=%timeline.timeline_id, "Flushing...");
timeline.freeze_and_flush().await?;
tracing::info!(timeline_id=%timeline.timeline_id, "Waiting for uploads...");
timeline.remote_client.wait_completion().await?;
Ok(())
}
// We do not use a JoinSet for these tasks, because we don't want them to be
// aborted when this function's future is cancelled: they should stay alive
// holding their GateGuard until they complete, to ensure their I/Os complete
// before Timeline shutdown completes.
let mut results = FuturesUnordered::new();
for (_timeline_id, timeline) in timelines {
// Run each timeline's flush in a task holding the timeline's gate: this
// means that if this function's future is cancelled, the Timeline shutdown
// will still wait for any I/O in here to complete.
let Ok(gate) = timeline.gate.enter() else {
continue;
};
let jh = tokio::task::spawn(async move { flush_timeline(gate, timeline).await });
results.push(jh);
}
while let Some(r) = results.next().await {
if let Err(e) = r {
if !e.is_cancelled() && !e.is_panic() {
tracing::error!("unexpected join error: {e:?}");
}
}
}
// The flushes we did above were just writes, but the Tenant might have had
// pending deletions as well from recent compaction/gc: we want to flush those
// as well. This requires flushing the global delete queue. This is cheap
// because it's typically a no-op.
match self.deletion_queue_client.flush_execute().await {
Ok(_) => {}
Err(DeletionQueueError::ShuttingDown) => {}
}
Ok(())
}
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
}
/// How much local storage would this tenant like to have? It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
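///
/// For example (illustrative numbers): a tenant whose branches occupy 10 GiB,
/// 2 GiB, and 1 GiB of visible layers reports 10 GiB (the max), not the
/// 13 GiB sum.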
pub(crate) fn local_storage_wanted(&self) -> u64 {
let timelines = self.timelines.lock().unwrap();
// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
}
/// Serialize and write the latest TenantManifest to remote storage.
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
// Only one manifest write may be done at a time, and the contents of the manifest
// must be loaded while holding this lock. This makes it safe to call this function
// from anywhere without worrying about colliding updates.
let mut guard = tokio::select! {
g = self.tenant_manifest_upload.lock() => {
g
},
_ = self.cancel.cancelled() => {
return Err(TenantManifestError::Cancelled);
}
};
let manifest = self.build_tenant_manifest();
if Some(&manifest) == (*guard).as_ref() {
// Optimisation: skip uploads that don't change anything.
return Ok(());
}
upload_tenant_manifest(
&self.remote_storage,
&self.tenant_shard_id,
self.generation,
&manifest,
&self.cancel,
)
.await
.map_err(|e| {
if self.cancel.is_cancelled() {
TenantManifestError::Cancelled
} else {
TenantManifestError::RemoteStorage(e)
}
})?;
// Store the successfully uploaded manifest, so that future callers can avoid
// re-uploading the same thing.
*guard = Some(manifest);
Ok(())
}
}
/// Create the cluster temporarily in the 'initdb_target_dir' directory inside the repository
/// to get bootstrap data for timeline initialization.
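///
/// An illustrative call (a sketch mirroring `bootstrap_timeline`):
///
/// ```ignore
/// run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel)
///     .await
///     .context("run initdb")?;
/// ```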
async fn run_initdb(
conf: &'static PageServerConf,
initdb_target_dir: &Utf8Path,
pg_version: u32,
cancel: &CancellationToken,
) -> Result<(), InitdbError> {
let initdb_bin_path = conf
.pg_bin_dir(pg_version)
.map_err(InitdbError::Other)?
.join("initdb");
let initdb_lib_dir = conf.pg_lib_dir(pg_version).map_err(InitdbError::Other)?;
info!(
"running {} in {}, libdir: {}",
initdb_bin_path, initdb_target_dir, initdb_lib_dir,
);
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
.args(["--pgdata", initdb_target_dir.as_ref()])
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdin(std::process::Stdio::null())
// initdb prints the same output to stdout every time; we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped())
.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
// we can't safely terminate initdb: it launches processes of its own, and killing
// initdb doesn't kill them. After we return from this function, we want the target
// directory to be able to be cleaned up.
// See https://github.com/neondatabase/neon/issues/6385
let initdb_output = initdb_command.wait_with_output().await?;
if !initdb_output.status.success() {
return Err(InitdbError::Failed(
initdb_output.status,
initdb_output.stderr,
));
}
// This isn't true cancellation support, see above. Still return an error to
// exercise the cancellation code path.
if cancel.is_cancelled() {
return Err(InitdbError::Cancelled);
}
Ok(())
}
/// Dump contents of a layer file to stdout.
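///
/// An illustrative call (a sketch; the path is a hypothetical layer file):
///
/// ```ignore
/// dump_layerfile_from_path(Utf8Path::new("/some/layer/file"), /* verbose */ true, &ctx).await?;
/// ```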
pub async fn dump_layerfile_from_path(
path: &Utf8Path,
verbose: bool,
ctx: &RequestContext,
) -> anyhow::Result<()> {
use std::os::unix::fs::FileExt;
// All layer files start with a two-byte "magic" value, to identify the kind of
// file.
let file = File::open(path)?;
let mut header_buf = [0u8; 2];
file.read_exact_at(&mut header_buf, 0)?;
match u16::from_be_bytes(header_buf) {
crate::IMAGE_FILE_MAGIC => {
ImageLayer::new_for_path(path, file)?
.dump(verbose, ctx)
.await?
}
crate::DELTA_FILE_MAGIC => {
DeltaLayer::new_for_path(path, file)?
.dump(verbose, ctx)
.await?
}
magic => bail!("unrecognized magic identifier: {:?}", magic),
}
Ok(())
}
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use utils::logging;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::l0_flush::L0FlushConfig;
use crate::walredo::apply_neon;
use pageserver_api::key::Key;
use pageserver_api::record::NeonWalRecord;
use super::*;
use hex_literal::hex;
use utils::id::TenantId;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with the given string as the only content
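///
/// ```ignore
/// // The image is zero-padded to a fixed 64-byte length:
/// assert_eq!(test_img("foo").len(), 64);
/// ```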
pub fn test_img(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(64, 0);
buf.freeze()
}
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
compaction_algorithm: Some(tenant_conf.compaction_algorithm),
gc_horizon: Some(tenant_conf.gc_horizon),
gc_period: Some(tenant_conf.gc_period),
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
pitr_interval: Some(tenant_conf.pitr_interval),
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
eviction_policy: Some(tenant_conf.eviction_policy),
min_resident_size_override: tenant_conf.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: Some(
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
timeline_get_throttle: Some(tenant_conf.timeline_get_throttle),
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
}
}
}
pub struct TenantHarness {
pub conf: &'static PageServerConf,
pub tenant_conf: TenantConf,
pub tenant_shard_id: TenantShardId,
pub generation: Generation,
pub shard: ShardIndex,
pub remote_storage: GenericRemoteStorage,
pub remote_fs_dir: Utf8PathBuf,
pub deletion_queue: MockDeletionQueue,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
pub(crate) fn setup_logging() {
LOG_HANDLE.get_or_init(|| {
logging::init(
logging::LogFormat::Test,
// enable it in case the tests exercise code paths that use
// debug_assert_current_span_has_tenant_and_timeline_id
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.expect("Failed to init test logging")
});
}
impl TenantHarness {
pub async fn create_custom(
test_name: &'static str,
tenant_conf: TenantConf,
tenant_id: TenantId,
shard_identity: ShardIdentity,
generation: Generation,
) -> anyhow::Result<Self> {
setup_logging();
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be freed, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let shard = shard_identity.shard_index();
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number: shard.shard_number,
shard_count: shard.shard_count,
};
fs::create_dir_all(conf.tenant_path(&tenant_shard_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_shard_id))?;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs {
local_path: remote_fs_dir.clone(),
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap();
let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone()));
Ok(Self {
conf,
tenant_conf,
tenant_shard_id,
generation,
shard,
remote_storage,
remote_fs_dir,
deletion_queue,
})
}
pub async fn create(test_name: &'static str) -> anyhow::Result<Self> {
// Disable automatic GC and compaction to make the unit tests more deterministic.
// The tests perform them manually if needed.
let tenant_conf = TenantConf {
gc_period: Duration::ZERO,
compaction_period: Duration::ZERO,
..TenantConf::default()
};
let tenant_id = TenantId::generate();
let shard = ShardIdentity::unsharded();
Self::create_custom(
test_name,
tenant_conf,
tenant_id,
shard,
Generation::new(0xdeadbeef),
)
.await
}
pub fn span(&self) -> tracing::Span {
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
}
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.do_try_load(&ctx)
.await
.expect("failed to load test tenant"),
ctx,
)
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) async fn do_try_load(
&self,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
self.conf,
AttachedTenantConf::try_from(LocationConf::attached_single(
TenantConfOpt::from(self.tenant_conf.clone()),
self.generation,
&ShardParameters::default(),
))
.unwrap(),
// This is a legacy/test code path: sharding isn't supported here.
ShardIdentity::unsharded(),
Some(walredo_mgr),
self.tenant_shard_id,
self.remote_storage.clone(),
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
));
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), ctx).await?;
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
timeline.set_state(TimelineState::Active);
}
Ok(tenant)
}
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
}
}
// Mock WAL redo manager that doesn't do much
pub(crate) struct TestRedoManager;
impl TestRedoManager {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn request_redo(
&self,
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
records: Vec<(Lsn, NeonWalRecord)>,
_pg_version: u32,
) -> Result<Bytes, walredo::Error> {
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
if records_neon {
// For Neon wal records, we can decode without spawning postgres, so do so.
let mut page = match (base_img, records.first()) {
(Some((_lsn, img)), _) => {
let mut page = BytesMut::new();
page.extend_from_slice(&img);
page
}
(_, Some((_lsn, rec))) if rec.will_init() => BytesMut::new(),
_ => {
panic!("Neon WAL redo requires base image or will init record");
}
};
for (record_lsn, record) in records {
apply_neon::apply_in_neon(&record, record_lsn, key, &mut page)?;
}
Ok(page.freeze())
} else {
// We never spawn a postgres walredo process in unit tests: just log what we might have done.
let s = format!(
"redo for {} to get to {}, with {} and {} records",
key,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{s}");
Ok(test_img(&s))
}
}
}
}
#[cfg(test)]
mod tests {
use std::collections::{BTreeMap, BTreeSet};
use super::*;
use crate::keyspace::KeySpaceAccum;
use crate::tenant::harness::*;
use crate::tenant::timeline::CompactFlags;
use crate::DEFAULT_PG_VERSION;
use bytes::{Bytes, BytesMut};
use hex_literal::hex;
use itertools::Itertools;
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
use pageserver_api::value::Value;
use pageserver_compaction::helpers::overlaps_with;
use rand::{thread_rng, Rng};
use storage_layer::PersistentLayerKey;
use tests::storage_layer::ValuesReconstructState;
use tests::timeline::{GetVectoredError, ShutdownMode};
use timeline::DeltaLayerTestDesc;
use utils::id::TenantId;
#[cfg(feature = "testing")]
use pageserver_api::record::NeonWalRecord;
#[cfg(feature = "testing")]
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
#[cfg(feature = "testing")]
use timeline::GcInfo;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
#[tokio::test]
async fn test_basic() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(test_img("foo at 0x10")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(test_img("foo at 0x20")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
test_img("foo at 0x10")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
test_img("foo at 0x10")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
test_img("foo at 0x20")
);
Ok(())
}
#[tokio::test]
async fn no_duplicate_timelines() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")
.await?
.load()
.await;
let _ = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
match tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
{
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(
e.to_string(),
"timeline already exists with different parameters".to_string()
),
}
Ok(())
}
/// Convenience function to create a `Value::Image` with the given string as the only content
pub fn test_value(s: &str) -> Value {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
Value::Image(buf.freeze())
}
///
/// Test branch creation
///
#[tokio::test]
async fn test_branch() -> anyhow::Result<()> {
use std::str::from_utf8;
let (tenant, ctx) = TenantHarness::create("test_branch").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut writer = tline.writer().await;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
#[allow(non_snake_case)]
let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
// Insert a value on the timeline
writer
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
.await?;
writer
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
.await?;
writer.finish_write(Lsn(0x20));
writer
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
.await?;
writer.finish_write(Lsn(0x30));
writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
.await?;
writer.finish_write(Lsn(0x40));
//assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
let mut new_writer = newtline.writer().await;
new_writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
.await?;
new_writer.finish_write(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
"foo at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?,
"bar at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?,
"foobar at 0x20"
);
//assert_current_logical_size(&tline, Lsn(0x40));
Ok(())
}
async fn make_some_layers(
tline: &Timeline,
start_lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut lsn = start_lsn;
{
let mut writer = tline.writer().await;
// Create a relation on the timeline
writer
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
lsn += 0x10;
writer
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
lsn += 0x10;
}
tline.freeze_and_flush().await?;
{
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
lsn += 0x10;
writer
.put(
*TEST_KEY,
lsn,
&Value::Image(test_img(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
}
tline.freeze_and_flush().await.map_err(|e| e.into())
}
#[tokio::test(start_paused = true)]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the flushing
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
.await
{
Ok(_) => panic!("branching should have failed"),
Err(err) => {
let CreateTimelineError::AncestorLsn(err) = err else {
panic!("wrong error type")
};
assert!(err.to_string().contains("invalid branch start lsn"));
assert!(err
.source()
.unwrap()
.to_string()
.contains("we might've already garbage collected needed data"))
}
}
Ok(())
}
#[tokio::test]
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)
.await?;
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
match tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx)
.await
{
Ok(_) => panic!("branching should have failed"),
Err(err) => {
let CreateTimelineError::AncestorLsn(err) = err else {
panic!("wrong error type");
};
assert!(&err.to_string().contains("invalid branch start lsn"));
assert!(&err
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC cutoff"));
}
}
Ok(())
}
/*
// FIXME: This currently fails to error out. Calling GC doesn't currently
// remove the old value, we'd need to work a little harder
#[tokio::test]
async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
let repo =
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
}
Ok(())
}
*/
#[tokio::test]
async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
tline.set_broken("test".to_owned());
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
// The branchpoints should contain all timelines, even ones marked
// as Broken.
{
let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
assert_eq!(branchpoints.len(), 1);
assert_eq!(
branchpoints[0],
(Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No)
);
}
// You can read the key from the child branch even though the parent is
// Broken, as long as you don't need to access data from the parent.
assert_eq!(
newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
test_img(&format!("foo at {}", Lsn(0x70)))
);
// This needs to traverse to the parent, and fails.
let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
assert!(
err.to_string().starts_with(&format!(
"bad state on timeline {}: Broken",
tline.timeline_id
)),
"{err}"
);
Ok(())
}
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok());
Ok(())
}
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_parent_keeps_data_forever_after_branching")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
// run gc on parent
tenant
.gc_iteration(
Some(TIMELINE_ID),
0x10,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await?;
// Check that the data is still accessible on the branch.
assert_eq!(
newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
test_img(&format!("foo at {}", Lsn(0x40)))
);
Ok(())
}
#[tokio::test]
async fn timeline_load() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.instrument(harness.span())
.await
.ok()
.unwrap();
}
let (tenant, _ctx) = harness.load().await;
tenant
.get_timeline(TIMELINE_ID, true)
.expect("cannot load timeline");
Ok(())
}
#[tokio::test]
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = TenantHarness::create(TEST_NAME).await?;
// create two timelines
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
let child_tline = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
.await?;
child_tline.set_state(TimelineState::Active);
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), ShutdownMode::FreezeAndFlush)
.instrument(harness.span())
.await
.ok()
.unwrap();
}
// Reload the tenant and check that both the child and the ancestor are loaded
let (tenant, _ctx) = harness.load().await;
let _child_tline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("cannot get child timeline loaded");
let _ancestor_tline = tenant
.get_timeline(TIMELINE_ID, true)
.expect("cannot get ancestor timeline loaded");
Ok(())
}
#[tokio::test]
async fn delta_layer_dumping() -> anyhow::Result<()> {
use storage_layer::AsLayerDesc;
let (tenant, ctx) = TenantHarness::create("test_layer_dumping")
.await?
.load()
.await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map
.layer_map()?
.level0_deltas()
.iter()
.map(|desc| layer_map.get_from_desc(desc))
.collect::<Vec<_>>();
assert!(!level0_deltas.is_empty());
for delta in level0_deltas {
// Ensure we are dumping a delta layer here
assert!(delta.layer_desc().is_delta);
delta.dump(true, &ctx).await.unwrap();
}
Ok(())
}
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_images").await?.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(test_img("foo at 0x10")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(test_img("foo at 0x20")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x30),
&Value::Image(test_img("foo at 0x30")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x30));
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
let mut writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x40),
&Value::Image(test_img("foo at 0x40")),
&ctx,
)
.await?;
writer.finish_write(Lsn(0x40));
drop(writer);
tline.freeze_and_flush().await?;
tline
.compact(&CancellationToken::new(), EnumSet::empty(), &ctx)
.await?;
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?,
test_img("foo at 0x10")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?,
test_img("foo at 0x10")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?,
test_img("foo at 0x20")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?,
test_img("foo at 0x30")
);
assert_eq!(
tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?,
test_img("foo at 0x40")
);
Ok(())
}
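/// Bulk-insert helper with compaction enabled after every flushed batch; see
/// [`bulk_insert_maybe_compact_gc`] for the full workload.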
async fn bulk_insert_compact_gc(
tenant: &Tenant,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
repeat: usize,
key_count: usize,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let compact = true;
bulk_insert_maybe_compact_gc(tenant, timeline, ctx, lsn, repeat, key_count, compact).await
}
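/// Shared workload for the bulk-insert tests: writes `repeat * key_count`
/// images at strictly increasing keys and LSNs, freezing and flushing after
/// each batch (and compacting when `compact` is set), then runs a GC
/// iteration that is asserted to remove nothing. Returns every (key, LSN)
/// pair written so callers can validate later reads.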
async fn bulk_insert_maybe_compact_gc(
tenant: &Tenant,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
repeat: usize,
key_count: usize,
compact: bool,
) -> anyhow::Result<HashMap<Key, BTreeSet<Lsn>>> {
let mut inserted: HashMap<Key, BTreeSet<Lsn>> = Default::default();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
// Enforce that the key range is monotonically increasing
let mut keyspace = KeySpaceAccum::new();
let cancel = CancellationToken::new();
for _ in 0..repeat {
for _ in 0..key_count {
test_key.field6 = blknum;
let mut writer = timeline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
ctx,
)
.await?;
inserted.entry(test_key).or_default().insert(lsn);
writer.finish_write(lsn);
drop(writer);
keyspace.add_key(test_key);
lsn = Lsn(lsn.0 + 0x10);
blknum += 1;
}
timeline.freeze_and_flush().await?;
if compact {
// this requires timeline to be &Arc<Timeline>
timeline.compact(&cancel, EnumSet::empty(), ctx).await?;
}
// this doesn't strictly need to target a specific timeline_id, but it stays closer to
// what the test originally did.
let res = tenant
.gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx)
.await?;
assert_eq!(res.layers_removed, 0, "this never removes anything");
}
Ok(inserted)
}
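// A minimal sketch (not called by any test) of the key convention used by the
// helpers above: a fixed hex prefix identifies the test relation and `field6`
// acts as the block number, so `Key::next()` moves to the adjacent block.
#[allow(dead_code)]
fn example_test_key_layout() {
    let mut key = Key::from_hex("010000000033333333444444445500000000").unwrap();
    key.field6 = 7; // pretend this is block 7 of the test relation
    let mut expected = key;
    expected.field6 = 8;
    assert_eq!(key.next(), expected, "next() advances the block number");
}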
//
// Insert 1000 key-value pairs with increasing keys, flush, compact, GC.
// Repeat 50 times.
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_bulk_insert").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
Ok(())
}
// Test the vectored get real implementation against a simple sequential implementation.
//
// The test generates a keyspace by repeatedly flushing the in-memory layer and compacting.
// Projected to 2D the key space looks like below. Lsn grows upwards on the Y axis and keys
// grow to the right on the X axis.
// [Delta]
// [Delta]
// [Delta]
// [Delta]
// ------------ Image ---------------
//
// After layer generation we pick the ranges to query as follows:
// 1. The beginning of each delta layer
// 2. At the seam between two adjacent delta layers
//
// There's one major downside to this test: delta layers only contain images,
// so the search can stop at the first delta layer and doesn't traverse any deeper.
#[tokio::test]
async fn test_get_vectored() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
let inserted = bulk_insert_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000).await?;
let guard = tline.layers.read().await;
let lm = guard.layer_map()?;
lm.dump(true, &ctx).await?;
let mut reads = Vec::new();
let mut prev = None;
lm.iter_historic_layers().for_each(|desc| {
if !desc.is_delta() {
prev = Some(desc.clone());
return;
}
let start = desc.key_range.start;
let end = desc
.key_range
.start
.add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
reads.push(KeySpace {
ranges: vec![start..end],
});
if let Some(prev) = &prev {
if !prev.is_delta() {
return;
}
let first_range = Key {
field6: prev.key_range.end.field6 - 4,
..prev.key_range.end
}..prev.key_range.end;
let second_range = desc.key_range.start..Key {
field6: desc.key_range.start.field6 + 4,
..desc.key_range.start
};
reads.push(KeySpace {
ranges: vec![first_range, second_range],
});
};
prev = Some(desc.clone());
});
drop(guard);
// Pick a big LSN such that we query over all the changes.
let reads_lsn = Lsn(u64::MAX - 1);
for read in reads {
info!("Doing vectored read on {:?}", read);
let vectored_res = tline
.get_vectored_impl(
read.clone(),
reads_lsn,
&mut ValuesReconstructState::new(),
&ctx,
)
.await;
let mut expected_lsns: HashMap<Key, Lsn> = Default::default();
let mut expect_missing = false;
let mut key = read.start().unwrap();
while key != read.end().unwrap() {
if let Some(lsns) = inserted.get(&key) {
let expected_lsn = lsns.iter().rfind(|lsn| **lsn <= reads_lsn);
match expected_lsn {
Some(lsn) => {
expected_lsns.insert(key, *lsn);
}
None => {
expect_missing = true;
break;
}
}
} else {
expect_missing = true;
break;
}
key = key.next();
}
if expect_missing {
assert!(matches!(vectored_res, Err(GetVectoredError::MissingKey(_))));
} else {
for (key, image) in vectored_res? {
let expected_lsn = expected_lsns.get(&key).expect("determined above");
let expected_image = test_img(&format!("{} at {}", key.field6, expected_lsn));
assert_eq!(image?, expected_image);
}
}
}
Ok(())
}
#[tokio::test]
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_aux_files").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();
let mut modification = tline.begin_modification(Lsn(0x1000));
modification.put_file("foo/bar1", b"content1", &ctx).await?;
modification.set_lsn(Lsn(0x1008))?;
modification.put_file("foo/bar2", b"content2", &ctx).await?;
modification.commit(&ctx).await?;
let child_timeline_id = TimelineId::generate();
tenant
.branch_timeline_test(
tline,
child_timeline_id,
Some(tline.get_last_record_lsn()),
&ctx,
)
.await?;
let child_timeline = tenant
.get_timeline(child_timeline_id, true)
.expect("Should have the branched timeline");
let aux_keyspace = KeySpace {
ranges: vec![NON_INHERITED_RANGE],
};
let read_lsn = child_timeline.get_last_record_lsn();
let vectored_res = child_timeline
.get_vectored_impl(
aux_keyspace.clone(),
read_lsn,
&mut ValuesReconstructState::new(),
&ctx,
)
.await;
let images = vectored_res?;
assert!(images.is_empty());
Ok(())
}
// Test that vectored get handles layer gaps correctly
// by advancing into the next ancestor timeline if required.
//
// The test generates timelines that look like the diagram below.
// We leave a gap in one of the L1 layers at `gap_at_key` (`/` in the diagram).
// The reconstruct data for that key lies in the ancestor timeline (`X` in the diagram).
//
// ```
//-------------------------------+
// ... |
// [ L1 ] |
// [ / L1 ] | Child Timeline
// ... |
// ------------------------------+
// [ X L1 ] | Parent Timeline
// ------------------------------+
// ```
#[tokio::test]
async fn test_get_vectored_key_gap() -> anyhow::Result<()> {
let tenant_conf = TenantConf {
// Make compaction deterministic
gc_period: Duration::ZERO,
compaction_period: Duration::ZERO,
// Encourage creation of L1 layers
checkpoint_distance: 16 * 1024,
compaction_target_size: 8 * 1024,
..TenantConf::default()
};
let harness = TenantHarness::create_custom(
"test_get_vectored_key_gap",
tenant_conf,
TenantId::generate(),
ShardIdentity::unsharded(),
Generation::new(0xdeadbeef),
)
.await?;
let (tenant, ctx) = harness.load().await;
let mut current_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let gap_at_key = current_key.add(100);
let mut current_lsn = Lsn(0x10);
const KEY_COUNT: usize = 10_000;
let timeline_id = TimelineId::generate();
let current_timeline = tenant
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
current_lsn += 0x100;
let mut writer = current_timeline.writer().await;
writer
.put(
gap_at_key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", gap_at_key, current_lsn))),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
drop(writer);
let mut latest_lsns = HashMap::new();
latest_lsns.insert(gap_at_key, current_lsn);
current_timeline.freeze_and_flush().await?;
let child_timeline_id = TimelineId::generate();
tenant
.branch_timeline_test(
&current_timeline,
child_timeline_id,
Some(current_lsn),
&ctx,
)
.await?;
let child_timeline = tenant
.get_timeline(child_timeline_id, true)
.expect("Should have the branched timeline");
for i in 0..KEY_COUNT {
if current_key == gap_at_key {
current_key = current_key.next();
continue;
}
current_lsn += 0x10;
let mut writer = child_timeline.writer().await;
writer
.put(
current_key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", current_key, current_lsn))),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
drop(writer);
latest_lsns.insert(current_key, current_lsn);
current_key = current_key.next();
// Flush every now and then to encourage layer file creation.
if i % 500 == 0 {
child_timeline.freeze_and_flush().await?;
}
}
child_timeline.freeze_and_flush().await?;
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceRepartition);
child_timeline
.compact(&CancellationToken::new(), flags, &ctx)
.await?;
let key_near_end = {
let mut tmp = current_key;
tmp.field6 -= 10;
tmp
};
let key_near_gap = {
let mut tmp = gap_at_key;
tmp.field6 -= 10;
tmp
};
let read = KeySpace {
ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
};
let results = child_timeline
.get_vectored_impl(
read.clone(),
current_lsn,
&mut ValuesReconstructState::new(),
&ctx,
)
.await?;
for (key, img_res) in results {
let expected = test_img(&format!("{} at {}", key, latest_lsns[&key]));
assert_eq!(img_res?, expected);
}
Ok(())
}
// Test that vectored get descends into ancestor timelines correctly and
// does not return an image that's newer than requested.
//
// The diagram below illustrates an interesting case. We have a parent timeline
// (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
// from the child timeline, so the parent timeline must be visited. When advancing into
// the ancestor timeline, the read path needs to remember what the requested Lsn was in
// order to avoid returning an image that's too new. The test below constructs such
// a timeline setup and does a few queries around the Lsn of each page image.
// ```
// LSN
// ^
// |
// |
// 500 | --------------------------------------> branch point
// 400 | X
// 300 | X
// 200 | --------------------------------------> requested lsn
// 100 | X
// |---------------------------------------> Key
// |
// ------> requested key
//
// Legend:
// * X - page images
// ```
#[tokio::test]
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis").await?;
let (tenant, ctx) = harness.load().await;
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_key = start_key.add(1000);
let child_gap_at_key = start_key.add(500);
let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
let mut current_lsn = Lsn(0x10);
let timeline_id = TimelineId::generate();
let parent_timeline = tenant
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
current_lsn += 0x100;
for _ in 0..3 {
let mut key = start_key;
while key < end_key {
current_lsn += 0x10;
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
let mut writer = parent_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&image_value)),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
if key == child_gap_at_key {
parent_gap_lsns.insert(current_lsn, image_value);
}
key = key.next();
}
parent_timeline.freeze_and_flush().await?;
}
let child_timeline_id = TimelineId::generate();
let child_timeline = tenant
.branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
.await?;
let mut key = start_key;
while key < end_key {
if key == child_gap_at_key {
key = key.next();
continue;
}
current_lsn += 0x10;
let mut writer = child_timeline.writer().await;
writer
.put(
key,
current_lsn,
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
&ctx,
)
.await?;
writer.finish_write(current_lsn);
key = key.next();
}
child_timeline.freeze_and_flush().await?;
let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
let mut query_lsns = Vec::new();
for image_lsn in parent_gap_lsns.keys().rev() {
for offset in lsn_offsets {
query_lsns.push(Lsn(image_lsn
.0
.checked_add_signed(offset)
.expect("Shouldn't overflow")));
}
}
for query_lsn in query_lsns {
let results = child_timeline
.get_vectored_impl(
KeySpace {
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
},
query_lsn,
&mut ValuesReconstructState::new(),
&ctx,
)
.await;
let expected_item = parent_gap_lsns
.iter()
.rev()
.find(|(lsn, _)| **lsn <= query_lsn);
info!(
"Doing vectored read at LSN {}. Expecting image to be: {:?}",
query_lsn, expected_item
);
match expected_item {
Some((_, img_value)) => {
let key_results = results.expect("No vectored get error expected");
let key_result = &key_results[&child_gap_at_key];
let returned_img = key_result
.as_ref()
.expect("No page reconstruct error expected");
info!(
"Vectored read at LSN {} returned image {}",
query_lsn,
std::str::from_utf8(returned_img)?
);
assert_eq!(*returned_img, test_img(img_value));
}
None => {
assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
}
}
}
Ok(())
}
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let names_algorithms = [
("test_random_updates_legacy", CompactionAlgorithm::Legacy),
("test_random_updates_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_random_updates_algorithm(name, algorithm).await?;
}
Ok(())
}
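/// Runs the random-update workload under the given compaction algorithm:
/// every page stores an image recording its block number and write LSN, so a
/// read at the head LSN can be checked against the last recorded write.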
async fn test_random_updates_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
let cancel = CancellationToken::new();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut test_key_end = test_key;
test_key_end.field6 = NUM_KEYS as u32;
tline.add_extra_test_dense_keyspace(KeySpace::single(test_key..test_key_end));
let mut keyspace = KeySpaceAccum::new();
// Track when each page was last modified. Used to assert that
// a read sees the latest page version.
let mut updated = [Lsn(0); NUM_KEYS];
let mut lsn = Lsn(0x10);
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = blknum as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
keyspace.add_key(test_key);
}
for _ in 0..50 {
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}
// Read all the blocks
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = blknum as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
);
}
// Perform a cycle of flush and GC
tline.freeze_and_flush().await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
Ok(())
}
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_branches")
.await?
.load()
.await;
let mut tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut keyspace = KeySpaceAccum::new();
let cancel = CancellationToken::new();
// Track when each page was last modified. Used to assert that
// a read sees the latest page version.
let mut updated = [Lsn(0); NUM_KEYS];
let mut lsn = Lsn(0x10);
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = blknum as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
keyspace.add_key(test_key);
}
for _ in 0..50 {
let new_tline_id = TimelineId::generate();
tenant
.branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
.await?;
tline = tenant
.get_timeline(new_tline_id, true)
.expect("Should have the branched timeline");
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
println!("updating {} at {}", blknum, lsn);
writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}
// Read all the blocks
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = blknum as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
);
}
// Perform a cycle of flush, compact, and GC
tline.freeze_and_flush().await?;
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
Ok(())
}
#[tokio::test]
async fn test_traverse_ancestors() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")
.await?
.load()
.await;
let mut tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 100;
const NUM_TLINES: usize = 50;
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
// Track page mutation lsns across different timelines.
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];
let mut lsn = Lsn(0x10);
#[allow(clippy::needless_range_loop)]
for idx in 0..NUM_TLINES {
let new_tline_id = TimelineId::generate();
tenant
.branch_timeline_test(&tline, new_tline_id, Some(lsn), &ctx)
.await?;
tline = tenant
.get_timeline(new_tline_id, true)
.expect("Should have the branched timeline");
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = blknum as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} {} at {}", idx, blknum, lsn))),
&ctx,
)
.await?;
println!("updating [{}][{}] at {}", idx, blknum, lsn);
writer.finish_write(lsn);
drop(writer);
updated[idx][blknum] = lsn;
}
}
// Read pages from leaf timeline across all ancestors.
for (idx, lsns) in updated.iter().enumerate() {
for (blknum, lsn) in lsns.iter().enumerate() {
// Skip empty mutations.
if lsn.0 == 0 {
continue;
}
println!("checking [{idx}][{blknum}] at {lsn}");
test_key.field6 = blknum as u32;
assert_eq!(
tline.get(test_key, *lsn, &ctx).await?,
test_img(&format!("{idx} {blknum} at {lsn}"))
);
}
}
Ok(())
}
#[tokio::test]
async fn test_write_at_initdb_lsn_takes_optimization_code_path() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_empty_test_timeline_is_usable")
.await?
.load()
.await;
let initdb_lsn = Lsn(0x20);
let utline = tenant
.create_empty_timeline(TIMELINE_ID, initdb_lsn, DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = utline.raw_timeline().unwrap();
// Spawn the flush loop now so that we can set `expect_initdb_optimization`
tline.maybe_spawn_flush_loop();
// Make sure the timeline has the minimum set of required keys for operation.
// The only operation you can always do on an empty timeline is to `put` new data.
// Except if you `put` at `initdb_lsn`.
// In that case, there's an optimization to directly create image layers instead of delta layers.
// It uses `repartition()`, which assumes some keys to be present.
// Let's make sure the test timeline can handle that case.
{
let mut state = tline.flush_loop_state.lock().unwrap();
assert_eq!(
timeline::FlushLoopState::Running {
expect_initdb_optimization: false,
initdb_optimization_count: 0,
},
*state
);
*state = timeline::FlushLoopState::Running {
expect_initdb_optimization: true,
initdb_optimization_count: 0,
};
}
// Make writes at the initdb_lsn. When we flush it below, it should be handled by the optimization.
// As explained above, the optimization requires some keys to be present.
// As per `create_empty_timeline` documentation, use init_empty to set them.
// This is what `create_test_timeline` does, by the way.
let mut modification = tline.begin_modification(initdb_lsn);
modification
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(&ctx)
.await
.context("commit init_empty_test_timeline modification")?;
// Do the flush. The flush code will check the expectations that we set above.
tline.freeze_and_flush().await?;
// assert freeze_and_flush exercised the initdb optimization
{
let state = tline.flush_loop_state.lock().unwrap();
let timeline::FlushLoopState::Running {
expect_initdb_optimization,
initdb_optimization_count,
} = *state
else {
panic!("unexpected state: {:?}", *state);
};
assert!(expect_initdb_optimization);
assert!(initdb_optimization_count > 0);
}
Ok(())
}
#[tokio::test]
async fn test_create_guard_crash() -> anyhow::Result<()> {
let name = "test_create_guard_crash";
let harness = TenantHarness::create(name).await?;
{
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude any attempt to create it again
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(super::timeline::ShutdownMode::Hard)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);
}
let (tenant, _) = harness.load().await;
match tenant.get_timeline(TIMELINE_ID, false) {
Ok(_) => panic!("timeline should've been removed during load"),
Err(e) => {
assert_eq!(
e,
GetTimelineError::NotFound {
tenant_id: tenant.tenant_shard_id,
timeline_id: TIMELINE_ID,
}
)
}
}
assert!(!harness
.conf
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
.exists());
Ok(())
}
#[tokio::test]
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
let names_algorithms = [
("test_read_at_max_lsn_legacy", CompactionAlgorithm::Legacy),
("test_read_at_max_lsn_tiered", CompactionAlgorithm::Tiered),
];
for (name, algorithm) in names_algorithms {
test_read_at_max_lsn_algorithm(name, algorithm).await?;
}
Ok(())
}
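/// Bulk-inserts without compaction, then checks that a read at (nearly) the
/// maximum LSN still succeeds, i.e. the read path copes with the full stack
/// of uncompacted delta layers.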
async fn test_read_at_max_lsn_algorithm(
name: &'static str,
compaction_algorithm: CompactionAlgorithm,
) -> anyhow::Result<()> {
let mut harness = TenantHarness::create(name).await?;
harness.tenant_conf.compaction_algorithm = CompactionAlgorithmSettings {
kind: compaction_algorithm,
};
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
.await?;
let lsn = Lsn(0x10);
let compact = false;
bulk_insert_maybe_compact_gc(&tenant, &tline, &ctx, lsn, 50, 10000, compact).await?;
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let read_lsn = Lsn(u64::MAX - 1);
let result = tline.get(test_key, read_lsn, &ctx).await;
assert!(result.is_ok(), "result is not Ok: {}", result.unwrap_err());
Ok(())
}
#[tokio::test]
async fn test_metadata_scan() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_scan").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
const STEP: usize = 10000; // keys for random update + scan are base_key + idx * STEP
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let mut test_key = base_key;
// Track when each page was last modified. Used to assert that
// a read sees the latest page version.
let mut updated = [Lsn(0); NUM_KEYS];
let mut lsn = Lsn(0x10);
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
updated[blknum] = lsn;
drop(writer);
}
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
for iter in 0..=10 {
// Read all the blocks
for (blknum, last_lsn) in updated.iter().enumerate() {
test_key.field6 = (blknum * STEP) as u32;
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", blknum, last_lsn))
);
}
let mut cnt = 0;
for (key, value) in tline
.get_vectored_impl(
keyspace.clone(),
lsn,
&mut ValuesReconstructState::default(),
&ctx,
)
.await?
{
let blknum = key.field6 as usize;
let value = value?;
assert!(blknum % STEP == 0);
let blknum = blknum / STEP;
assert_eq!(
value,
test_img(&format!("{} at {}", blknum, updated[blknum]))
);
cnt += 1;
}
assert_eq!(cnt, NUM_KEYS);
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
updated[blknum] = lsn;
}
// Perform two cycles of flush, compact, and GC
for round in 0..2 {
tline.freeze_and_flush().await?;
tline
.compact(
&cancel,
if iter % 5 == 0 && round == 0 {
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
} else {
EnumSet::empty()
},
&ctx,
)
.await?;
tenant
.gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
.await?;
}
}
Ok(())
}
#[tokio::test]
async fn test_metadata_compaction_trigger() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_compaction_trigger").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
let cancel = CancellationToken::new();
let mut base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
base_key.field1 = AUX_KEY_PREFIX;
let test_key = base_key;
let mut lsn = Lsn(0x10);
for _ in 0..20 {
lsn = Lsn(lsn.0 + 0x10);
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", 0, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
tline.freeze_and_flush().await?; // force create a delta layer
}
let before_num_l0_delta_files =
tline.layers.read().await.layer_map()?.level0_deltas().len();
tline.compact(&cancel, EnumSet::empty(), &ctx).await?;
let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();
assert!(after_num_l0_delta_files < before_num_l0_delta_files, "after_num_l0_delta_files={after_num_l0_delta_files}, before_num_l0_delta_files={before_num_l0_delta_files}");
assert_eq!(
tline.get(test_key, lsn, &ctx).await?,
test_img(&format!("{} at {}", 0, lsn))
);
Ok(())
}
#[tokio::test]
async fn test_aux_file_e2e() {
let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap();
let (tenant, ctx) = harness.load().await;
let mut lsn = Lsn(0x08);
let tline: Arc<Timeline> = tenant
.create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test1", b"first", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
// we can read everything from the storage
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test1"),
Some(&bytes::Bytes::from_static(b"first"))
);
{
lsn += 8;
let mut modification = tline.begin_modification(lsn);
modification
.put_file("pg_logical/mappings/test2", b"second", &ctx)
.await
.unwrap();
modification.commit(&ctx).await.unwrap();
}
let files = tline.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(
files.get("pg_logical/mappings/test2"),
Some(&bytes::Bytes::from_static(b"second"))
);
let child = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(lsn), &ctx)
.await
.unwrap();
let files = child.list_aux_files(lsn, &ctx).await.unwrap();
assert_eq!(files.get("pg_logical/mappings/test1"), None);
assert_eq!(files.get("pg_logical/mappings/test2"), None);
}
#[tokio::test]
async fn test_metadata_image_creation() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_image_creation").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
const NUM_KEYS: usize = 1000;
const STEP: usize = 10000; // keys for random update + scan are base_key + idx * STEP
let cancel = CancellationToken::new();
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let mut test_key = base_key;
let mut lsn = Lsn(0x10);
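// Helper: perform a vectored scan and also report how many delta layers the
// read path visited, so the test can assert that forced image creation
// reduces delta traversal.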
async fn scan_with_statistics(
tline: &Timeline,
keyspace: &KeySpace,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
let mut reconstruct_state = ValuesReconstructState::default();
let res = tline
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
.await?;
Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
}
#[allow(clippy::needless_range_loop)]
for blknum in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
}
let keyspace = KeySpace::single(base_key..base_key.add((NUM_KEYS * STEP) as u32));
for iter in 1..=10 {
for _ in 0..NUM_KEYS {
lsn = Lsn(lsn.0 + 0x10);
let blknum = thread_rng().gen_range(0..NUM_KEYS);
test_key.field6 = (blknum * STEP) as u32;
let mut writer = tline.writer().await;
writer
.put(
test_key,
lsn,
&Value::Image(test_img(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
drop(writer);
}
tline.freeze_and_flush().await?;
if iter % 5 == 0 {
let (_, before_delta_file_accessed) =
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await?;
let (_, after_delta_file_accessed) =
scan_with_statistics(&tline, &keyspace, lsn, &ctx).await?;
assert!(after_delta_file_accessed < before_delta_file_accessed, "after_delta_file_accessed={after_delta_file_accessed}, before_delta_file_accessed={before_delta_file_accessed}");
// Given that we already produced an image layer, no delta layer should be needed for the scan, but we still allow a small threshold for unforeseen circumstances.
assert!(
after_delta_file_accessed <= 2,
"after_delta_file_accessed={after_delta_file_accessed}"
);
}
}
Ok(())
}
#[tokio::test]
async fn test_vectored_missing_data_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_data_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("000000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("000000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("000000000033333333444444445500000002").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while reading at 0x30 below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
tline.add_extra_test_dense_keyspace(KeySpace::single(base_key..(base_key_nonexist.next())));
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x30), vec![(base_key_child, test_img("data key 2"))])], // image layers
Lsn(0x30),
)
.await
.unwrap();
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
Some(test_img("data key 1"))
);
assert!(get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error());
assert!(
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error()
);
// test vectored get on child timeline
assert_eq!(
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
Some(test_img("data key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
Some(test_img("data key 2"))
);
assert!(
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx)
.await
.unwrap_err()
.is_missing_key_error()
);
Ok(())
}
#[tokio::test]
async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads").await?;
let (tenant, ctx) = harness.load().await;
let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(), // delta layers
vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
Lsn(0x20), // it's fine not to advance the LSN to 0x30 while reading at 0x30 below, because `get_vectored_impl` does not wait for the LSN
)
.await?;
let child = tenant
.branch_timeline_test_with_layers(
&tline,
NEW_TIMELINE_ID,
Some(Lsn(0x20)),
&ctx,
Vec::new(), // delta layers
vec![(
Lsn(0x30),
vec![(base_key_child, test_img("metadata key 2"))],
)], // image layers
Lsn(0x30),
)
.await
.unwrap();
let lsn = Lsn(0x30);
// test vectored get on parent timeline
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key, lsn, &ctx).await?,
Some(test_img("metadata key 1"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_child, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
None
);
// test vectored get on child timeline
assert_eq!(
get_vectored_impl_wrapper(&child, base_key, lsn, &ctx).await?,
None
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_child, lsn, &ctx).await?,
Some(test_img("metadata key 2"))
);
assert_eq!(
get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
None
);
Ok(())
}
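/// Single-key convenience wrapper around `get_vectored_impl` for the tests in
/// this module: returns `Ok(None)` when the read succeeds but yields no value
/// for the key (e.g. an unwritten metadata key), propagates
/// [`GetVectoredError`], and panics on a per-key reconstruct error.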
async fn get_vectored_impl_wrapper(
tline: &Arc<Timeline>,
key: Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let mut res = tline
.get_vectored_impl(
KeySpace::single(key..key.next()),
lsn,
&mut reconstruct_state,
ctx,
)
.await?;
Ok(res.pop_last().map(|(k, v)| {
assert_eq!(k, key);
v.unwrap()
}))
}
#[tokio::test]
async fn test_metadata_tombstone_reads() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_metadata_tombstone_reads").await?;
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
// We emulate the situation where the compaction algorithm creates an image layer that removes the tombstones
// Lsn 0x30 key0, key3, no key1+key2
// Lsn 0x20 key1+key2 tombstones
// Lsn 0x10 key1 in image, key2 in delta
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
],
// image layers
vec![
(Lsn(0x10), vec![(key1, test_img("metadata key 1"))]),
(
Lsn(0x30),
vec![
(key0, test_img("metadata key 0")),
(key3, test_img("metadata key 3")),
],
),
],
Lsn(0x30),
)
.await?;
let lsn = Lsn(0x30);
let old_lsn = Lsn(0x20);
assert_eq!(
get_vectored_impl_wrapper(&tline, key0, lsn, &ctx).await?,
Some(test_img("metadata key 0"))
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key1, lsn, &ctx).await?,
None,
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key2, lsn, &ctx).await?,
None,
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key1, old_lsn, &ctx).await?,
Some(Bytes::new()),
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key2, old_lsn, &ctx).await?,
Some(Bytes::new()),
);
assert_eq!(
get_vectored_impl_wrapper(&tline, key3, lsn, &ctx).await?,
Some(test_img("metadata key 3"))
);
Ok(())
}
#[tokio::test]
async fn test_metadata_tombstone_image_creation() {
let harness = TenantHarness::create("test_metadata_tombstone_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap();
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let key3 = Key::from_hex("620000000033333333444444445500000003").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x30)..Lsn(0x40),
vec![
(key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))),
(key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))),
],
),
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x40),
)
.await
.unwrap();
let cancel = CancellationToken::new();
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await
.unwrap();
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x40), &ctx)
.await
.unwrap()
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 2); // the image layer should contain only the two existing keys; tombstones should be removed.
}
#[tokio::test]
async fn test_metadata_tombstone_empty_image_creation() {
let harness = TenantHarness::create("test_metadata_tombstone_empty_image_creation")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap();
let key2 = Key::from_hex("620000000033333333444444445500000002").unwrap();
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
// delta layers
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x20),
vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))],
),
DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x20)..Lsn(0x30),
vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))],
),
],
// image layers
vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])],
Lsn(0x30),
)
.await
.unwrap();
let cancel = CancellationToken::new();
tline
.compact(
&cancel,
{
let mut flags = EnumSet::new();
flags.insert(CompactFlags::ForceImageLayerCreation);
flags.insert(CompactFlags::ForceRepartition);
flags
},
&ctx,
)
.await
.unwrap();
// Image layers are created at last_record_lsn
let images = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(images.len(), 0); // the image layer must not contain tombstones; if only tombstones remain, no image layer is created at all
}
#[tokio::test]
async fn test_simple_bottom_most_compaction_images() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_images").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
//
// | D3 |
// | D1 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expect from this compaction is:
// | D3 |
// | Part of D1 |
// --------- img layer with D1+D2 at GC horizon------------------
// img layer at 0x10
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::Image(Bytes::from("value 1@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::Image(Bytes::from("value 2@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::Image(Bytes::from("value 3@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::Image(Bytes::from("value 5@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::Image(Bytes::from("value 6@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::Image(Bytes::from("value 8@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::Image(Bytes::from("value 9@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x20"),
Bytes::from_static(b"value 2@0x30"),
Bytes::from_static(b"value 3@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x20"),
Bytes::from_static(b"value 6@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x48"),
Bytes::from_static(b"value 9@0x48"),
];
for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
expected
);
}
let cancel = CancellationToken::new();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for (idx, expected) in expected_result.iter().enumerate() {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
expected
);
}
// Check if the image layer at the GC horizon contains exactly what we want
let image_at_gc_horizon = tline
.inspect_image_layers(Lsn(0x30), &ctx)
.await
.unwrap()
.into_iter()
.filter(|(k, _)| k.is_metadata_key())
.collect::<Vec<_>>();
assert_eq!(image_at_gc_horizon.len(), 10);
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x20"),
Bytes::from_static(b"value 2@0x30"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x20"),
Bytes::from_static(b"value 6@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
for idx in 0..10 {
assert_eq!(
image_at_gc_horizon[idx],
(get_key(idx as u32), expected_result[idx].clone())
);
}
// Check if old layers are removed / new layers have the expected LSN
let all_layers = inspect_and_sort(&tline, None).await;
assert_eq!(
all_layers,
vec![
// Image layer at GC horizon
PersistentLayerKey {
key_range: Key::MIN..Key::MAX,
lsn_range: Lsn(0x30)..Lsn(0x31),
is_delta: false
},
// The delta layer below the horizon
PersistentLayerKey {
key_range: get_key(3)..get_key(4),
lsn_range: Lsn(0x30)..Lsn(0x48),
is_delta: true
},
// The delta3 layer that should not be picked for the compaction
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
);
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_neon_test_record() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_neon_test_record").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(2), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(2),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(",0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(",0x30")),
),
(get_key(3), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(3),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_clear()),
),
(get_key(4), Lsn(0x10), Value::Image("0x10".into())),
(
get_key(4),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_init()),
),
];
let image1 = vec![(get_key(1), "0x10".into())];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
Lsn(0x10)..Lsn(0x40),
delta1,
)], // delta layers
vec![(Lsn(0x10), image1)], // image layers
Lsn(0x50),
)
.await?;
assert_eq!(
tline.get(get_key(1), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
assert_eq!(
tline.get(get_key(2), Lsn(0x50), &ctx).await?,
Bytes::from_static(b"0x10,0x20,0x30")
);
// These would need the "Neon WAL redo requires base image" restriction to be lifted first:
// assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
// assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
Ok(())
}
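/// Exercises LSN leases under paused (virtual) tokio time: leases pin layers
/// against GC, renewing with a shorter duration is a no-op, renewing with a
/// longer one extends `valid_until`, and requesting a lease on an LSN that GC
/// has already cut off fails.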
#[tokio::test(start_paused = true)]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
.await
.unwrap()
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// the initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);
let image_layers = (0x20..=0x90)
.step_by(0x10)
.map(|n| {
(
Lsn(n),
vec![(key, test_img(&format!("data key at {:x}", n)))],
)
})
.collect();
let timeline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
Vec::new(),
image_layers,
end_lsn,
)
.await?;
let leased_lsns = [0x30, 0x50, 0x70];
let mut leases = Vec::new();
leased_lsns.iter().for_each(|n| {
leases.push(
timeline
.init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
.expect("lease request should succeed"),
);
});
let updated_lease_0 = timeline
.renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
.expect("lease renewal should succeed");
assert_eq!(
updated_lease_0.valid_until, leases[0].valid_until,
"Renewing with a shorter lease should not change the lease."
);
let updated_lease_1 = timeline
.renew_lsn_lease(
Lsn(leased_lsns[1]),
timeline.get_lsn_lease_length() * 2,
&ctx,
)
.expect("lease renewal should succeed");
assert!(
updated_lease_1.valid_until > leases[1].valid_until,
"Renewing with a long lease should renew lease with later expiration time."
);
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
info!(
"latest_gc_cutoff_lsn: {}",
*timeline.get_latest_gc_cutoff_lsn()
);
timeline.force_set_disk_consistent_lsn(end_lsn);
let res = tenant
.gc_iteration(
Some(TIMELINE_ID),
0,
Duration::ZERO,
&CancellationToken::new(),
&ctx,
)
.await
.unwrap();
// Keeping everything below Lsn(0x80) b/c of the leases:
// 0/10: initdb layer
// (0/20..=0/70).step_by(0x10): image layers added when creating the timeline.
assert_eq!(res.layers_needed_by_leases, 7);
// Keeping 0/90 b/c it is the latest layer.
assert_eq!(res.layers_not_updated, 1);
// Removed 0/80.
assert_eq!(res.layers_removed, 1);
// Make a lease on an already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
timeline
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
.expect_err("lease request on GC-ed LSN should fail");
// Should still be able to renew a currently valid lease
// Assumption: the original lease is still valid for 0/50.
// (use `Timeline::init_lsn_lease` for testing so it always does validation)
timeline
.init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
.expect("lease renewal with validation should succeed");
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
// using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// We create
// - one bottom-most image layer,
// - a delta layer D1 crossing the GC horizon with data below and above the horizon,
// - a delta layer D2 crossing the GC horizon with data only below the horizon,
// - a delta layer D3 above the horizon.
//
// | D3 |
// | D1 |
// -| |-- gc horizon -----------------
// | | | D2 |
// --------- img layer ------------------
//
// What we should expect from this compaction is:
// | D3 |
// | Part of D1 |
// --------- img layer with D1+D2 at GC horizon------------------
// img layer at 0x10
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
get_key(3),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x30), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
}
let cancel = CancellationToken::new();
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x30), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
}
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_generate_key_retention() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_generate_key_retention").await?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
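        // Advance the timeline's last LSN; the history below is passed to
        // `generate_key_retention` directly rather than ingested through WAL.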
tline.force_advance_lsn(Lsn(0x70));
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let history = vec![
(
key,
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"0x10")),
),
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x50),
Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
),
(
key,
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
(
key,
Lsn(0x80),
Value::Image(Bytes::copy_from_slice(
b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
)),
),
(
key,
Lsn(0x90),
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
),
];
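        // Plan retention for this key: GC horizon at 0x60 with retain LSNs at 0x20, 0x40 and
        // 0x50 below it, and no ancestor image. The `3` is assumed to be the delta-count
        // threshold above which an image is materialized.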
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
3,
None,
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![
(
Lsn(0x20),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20")),
)]),
),
(
Lsn(0x40),
KeyLogAtLsn(vec![
(
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
]),
),
(
Lsn(0x50),
KeyLogAtLsn(vec![(
Lsn(0x50),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40;0x50")),
)]),
),
(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
)]),
),
],
above_horizon: KeyLogAtLsn(vec![
(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
(
Lsn(0x80),
Value::Image(Bytes::copy_from_slice(
b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
)),
),
(
Lsn(0x90),
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
),
]),
};
assert_eq!(res, expected_res);
        // We expect GC-compaction to run with the original GC. This can create a situation where
        // the original GC algorithm removes some delta layers b/c there is full image coverage,
        // causing some keys to have an incomplete history below the lowest retain LSN.
// For example, we have
// ```plain
// init delta @ 0x10, image @ 0x20, delta @ 0x30 (gc_horizon), image @ 0x40.
// ```
// Now the GC horizon moves up, and we have
// ```plain
// init delta @ 0x10, image @ 0x20, delta @ 0x30, image @ 0x40 (gc_horizon)
// ```
// The original GC algorithm kicks in, and removes delta @ 0x10, image @ 0x20.
// We will end up with
// ```plain
// delta @ 0x30, image @ 0x40 (gc_horizon)
// ```
// Now we run the GC-compaction, and this key does not have a full history.
// We should be able to handle this partial history and drop everything before the
// gc_horizon image.
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
key,
Lsn(0x40),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")),
),
(
key,
Lsn(0x50),
Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
),
(
key,
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
(
key,
Lsn(0x80),
Value::Image(Bytes::copy_from_slice(
b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
)),
),
(
key,
Lsn(0x90),
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
),
];
let res = tline
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![
(
Lsn(0x40),
KeyLogAtLsn(vec![(
Lsn(0x40),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")),
)]),
),
(
Lsn(0x50),
KeyLogAtLsn(vec![(
Lsn(0x50),
Value::WalRecord(NeonWalRecord::wal_append(";0x50")),
)]),
),
(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
)]),
),
],
above_horizon: KeyLogAtLsn(vec![
(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
(
Lsn(0x80),
Value::Image(Bytes::copy_from_slice(
b"0x10;0x20;0x30;0x40;0x50;0x60;0x70;0x80",
)),
),
(
Lsn(0x90),
Value::WalRecord(NeonWalRecord::wal_append(";0x90")),
),
]),
};
assert_eq!(res, expected_res);
        // For branch compaction, the branch itself does not have the full history, so we need to
        // provide the ancestor image in the test case.
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append(";0x30")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x30;0x40")), // use the ancestor image to reconstruct the page
)]),
)],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
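        // Same ancestor-image scenario, but with a retain LSN at 0x30 below the horizon: the
        // record below 0x30 stays as a delta, and an image is materialized at the horizon (0x60).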
let history = vec![
(
key,
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
),
(
key,
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append(";0x40")),
),
(
key,
Lsn(0x60),
Value::WalRecord(NeonWalRecord::wal_append(";0x60")),
),
(
key,
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
),
];
let res = tline
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[Lsn(0x30)],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
below_horizon: vec![
(
Lsn(0x30),
KeyLogAtLsn(vec![(
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append(";0x20")),
)]),
),
(
Lsn(0x60),
KeyLogAtLsn(vec![(
Lsn(0x60),
Value::Image(Bytes::copy_from_slice(b"0x10;0x20;0x40;0x60")),
)]),
),
],
above_horizon: KeyLogAtLsn(vec![(
Lsn(0x70),
Value::WalRecord(NeonWalRecord::wal_append(";0x70")),
)]),
};
assert_eq!(res, expected_res);
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
let harness =
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
            // using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
get_key(3),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![
(Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_20 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_10 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
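        // Check every key at the head LSN (0x50), at the GC horizon, and at both retain LSNs
        // (0x20 and 0x10); compaction must keep all of these read points intact.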
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x20), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_20[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x10), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_10[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
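        // Run GC-compaction in dry-run mode first; every read point must still verify afterwards.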
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
        // We expect the layer map to be the same b/c of the dry run flag, but other background jobs
        // may clean things up concurrently, so we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// increase GC horizon and compact again
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
        verify_result().await; // no WAL records between 0x30 and 0x38, so we should obtain the same result
        // compact again without increasing the GC horizon
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
{
let harness =
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
.await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
            // using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(1),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
];
let delta2 = vec![
(
get_key(1),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(1),
Lsn(0x38),
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
                // delta1 and delta2 each contain only a single key but multiple updates
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![
(Lsn(0x10), tline.timeline_id, MaybeOffloaded::No),
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_20 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let expected_result_at_lsn_10 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10"),
Bytes::from_static(b"value 2@0x10"),
Bytes::from_static(b"value 3@0x10"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10"),
Bytes::from_static(b"value 6@0x10"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), gc_horizon, &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x20), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_20[idx]
);
assert_eq!(
tline
.get(get_key(idx as u32), Lsn(0x10), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_10[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
let mut dryrun_flags = EnumSet::new();
dryrun_flags.insert(CompactFlags::DryRun);
tline
.compact_with_gc(&cancel, dryrun_flags, &ctx)
.await
.unwrap();
        // We expect the layer map to be the same b/c of the dry run flag, but other background jobs
        // may clean things up concurrently, so we don't do sanity checks on the layer map during unit tests.
verify_result().await;
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
// compact again
tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x28),
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
),
(
get_key(3),
Lsn(0x30),
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::WalRecord(NeonWalRecord::wal_append("@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
),
];
let parent_tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![], // delta layers
vec![(Lsn(0x18), img_layer)], // image layers
Lsn(0x18),
)
.await?;
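        // These keys use a non-aux prefix, so explicitly register them in the dense keyspace
        // returned by `collect_keyspace`.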
parent_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
let branch_tline = tenant
.branch_timeline_test_with_layers(
&parent_tline,
NEW_TIMELINE_ID,
Some(Lsn(0x18)),
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![], // image layers
Lsn(0x50),
)
.await?;
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
{
// Update GC info
let mut guard = parent_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Lsn(0x10),
space: Lsn(0x10),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
{
// Update GC info
let mut guard = branch_tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Lsn(0x50),
space: Lsn(0x50),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let expected_result_at_gc_horizon = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10@0x48"),
Bytes::from_static(b"value 9@0x10@0x48"),
];
let expected_result_at_lsn_40 = [
Bytes::from_static(b"value 0@0x10"),
Bytes::from_static(b"value 1@0x10@0x20"),
Bytes::from_static(b"value 2@0x10@0x30"),
Bytes::from_static(b"value 3@0x10@0x28@0x30@0x40"),
Bytes::from_static(b"value 4@0x10"),
Bytes::from_static(b"value 5@0x10@0x20"),
Bytes::from_static(b"value 6@0x10@0x20"),
Bytes::from_static(b"value 7@0x10"),
Bytes::from_static(b"value 8@0x10"),
Bytes::from_static(b"value 9@0x10"),
];
let verify_result = || async {
for idx in 0..10 {
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x50), &ctx)
.await
.unwrap(),
&expected_result_at_gc_horizon[idx]
);
assert_eq!(
branch_tline
.get(get_key(idx as u32), Lsn(0x40), &ctx)
.await
.unwrap(),
&expected_result_at_lsn_40[idx]
);
}
};
verify_result().await;
let cancel = CancellationToken::new();
branch_tline
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
.await
.unwrap();
verify_result().await;
Ok(())
}
// Regression test for https://github.com/neondatabase/neon/issues/9012
// Create an image arrangement where we have to read at different LSN ranges
// from a delta layer. This is achieved by overlapping an image layer on top of
// a delta layer. Like so:
//
// A B
// +----------------+ -> delta_layer
// | | ^ lsn
// | =========|-> nested_image_layer |
// | C | |
// +----------------+ |
// ======== -> baseline_image_layer +-------> key
//
//
// When querying the key range [A, B) we need to read at different LSN ranges
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
let (tenant, ctx) = harness.load().await;
let will_init_keys = [2, 6];
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
let mut expected_key_values = HashMap::new();
let baseline_image_layer_lsn = Lsn(0x10);
let mut baseline_img_layer = Vec::new();
for i in 0..5 {
let key = get_key(i);
let value = format!("value {i}@{baseline_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
baseline_img_layer.push((key, Bytes::from(value)));
}
let nested_image_layer_lsn = Lsn(0x50);
let mut nested_img_layer = Vec::new();
for i in 5..10 {
let key = get_key(i);
let value = format!("value {i}@{nested_image_layer_lsn}");
let removed = expected_key_values.insert(key, value.clone());
assert!(removed.is_none());
nested_img_layer.push((key, Bytes::from(value)));
}
let mut delta_layer_spec = Vec::default();
let delta_layer_start_lsn = Lsn(0x20);
let mut delta_layer_end_lsn = delta_layer_start_lsn;
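        // Keys covered by the nested image layer get their delta *above* that image's LSN, so a
        // read over [A, B) must use different LSN ranges of the same delta layer; all other keys
        // get their delta at the layer's start LSN.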
for i in 0..10 {
let key = get_key(i);
let key_in_nested = nested_img_layer
.iter()
.any(|(key_with_img, _)| *key_with_img == key);
let lsn = {
if key_in_nested {
Lsn(nested_image_layer_lsn.0 + 0x10)
} else {
delta_layer_start_lsn
}
};
let will_init = will_init_keys.contains(&i);
if will_init {
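                // `wal_init` starts the key from scratch, so its expected value restarts from
                // the empty string instead of extending the baseline image.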
delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init())));
expected_key_values.insert(key, "".to_string());
} else {
let delta = format!("@{lsn}");
delta_layer_spec.push((
key,
lsn,
Value::WalRecord(NeonWalRecord::wal_append(&delta)),
));
expected_key_values
.get_mut(&key)
.expect("An image exists for each key")
.push_str(delta.as_str());
}
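            // Track the maximum LSN seen so the delta layer's LSN range covers every record.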
            delta_layer_end_lsn = std::cmp::max(delta_layer_end_lsn, lsn);
}
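        // Layer LSN ranges are end-exclusive, so bump one past the last record.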
delta_layer_end_lsn = Lsn(delta_layer_end_lsn.0 + 1);
assert!(
nested_image_layer_lsn > delta_layer_start_lsn
&& nested_image_layer_lsn < delta_layer_end_lsn
);
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
baseline_image_layer_lsn,
DEFAULT_PG_VERSION,
&ctx,
vec![DeltaLayerTestDesc::new_with_inferred_key_range(
delta_layer_start_lsn..delta_layer_end_lsn,
delta_layer_spec,
)], // delta layers
vec![
(baseline_image_layer_lsn, baseline_img_layer),
(nested_image_layer_lsn, nested_img_layer),
], // image layers
delta_layer_end_lsn,
)
.await?;
let keyspace = KeySpace::single(get_key(0)..get_key(10));
let results = tline
.get_vectored(keyspace, delta_layer_end_lsn, &ctx)
.await
.expect("No vectored errors");
for (key, res) in results {
let value = res.expect("No key errors");
let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
assert_eq!(value, Bytes::from(expected_value));
}
Ok(())
}
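    /// List the timeline's historic layers, optionally filtered to a key range, sorted by
    /// (is_delta, key range, LSN range) so tests can compare against a deterministic layer map.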
async fn inspect_and_sort(
tline: &Arc<Timeline>,
filter: Option<std::ops::Range<Key>>,
) -> Vec<PersistentLayerKey> {
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
if let Some(filter) = filter {
all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
}
all_layers.sort_by(|k1, k2| {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
});
all_layers
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?;
let (tenant, ctx) = harness.load().await;
fn get_key(id: u32) -> Key {
            // using aux keys here b/c they are guaranteed to be inside `collect_keyspace`.
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// img layer at 0x10
let img_layer = (0..10)
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
.collect_vec();
let delta1 = vec![
(
get_key(1),
Lsn(0x20),
Value::Image(Bytes::from("value 1@0x20")),
),
(
get_key(2),
Lsn(0x30),
Value::Image(Bytes::from("value 2@0x30")),
),
(
get_key(3),
Lsn(0x40),
Value::Image(Bytes::from("value 3@0x40")),
),
];
let delta2 = vec![
(
get_key(5),
Lsn(0x20),
Value::Image(Bytes::from("value 5@0x20")),
),
(
get_key(6),
Lsn(0x20),
Value::Image(Bytes::from("value 6@0x20")),
),
];
let delta3 = vec![
(
get_key(8),
Lsn(0x48),
Value::Image(Bytes::from("value 8@0x48")),
),
(
get_key(9),
Lsn(0x48),
Value::Image(Bytes::from("value 9@0x48")),
),
];
let tline = tenant
.create_test_timeline_with_layers(
TIMELINE_ID,
Lsn(0x10),
DEFAULT_PG_VERSION,
&ctx,
vec![
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
], // delta layers
vec![(Lsn(0x10), img_layer)], // image layers
Lsn(0x50),
)
.await?;
{
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
within_ancestor_pitr: false,
};
}
let cancel = CancellationToken::new();
        // Do a partial compaction on key range 0..4; we should generate an image layer, but no other
        // layers can be removed because they might be needed by other key ranges.
tline
.partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
);
// Do a partial compaction on key range 4..10
tline
.partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
// if (in the future) GC kicks in, this layer will be removed
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(4)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
);
        // Do a partial compaction on key range 0..10; all image layers below Lsn(0x20) can be replaced with new ones.
tline
.partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
assert_eq!(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(4)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true
}
]
);
Ok(())
}
}