Merge branch 'problame/hung-shutdown/demo-hypothesis' into problame/hung-shutdown/fix

2026-05-19 06:00:38 +00:00 · 2025-01-14 23:51:53 +01:00
parent 4e094d9638 a8f9b564be
commit 366ff9ffcc
89 changed files with 4873 additions and 2437 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -53,10 +53,12 @@ project_build_tag!(BUILD_TAG);
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

-/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20).
+/// Configure jemalloc to profile heap allocations by sampling stack traces every 2 MB (1 << 21).
+/// This adds roughly 3% overhead for allocations on average, which is acceptable considering
+/// performance-sensitive code will avoid allocations as far as possible anyway.
 #[allow(non_upper_case_globals)]
 #[export_name = "malloc_conf"]
-pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";

 const PID_FILE_NAME: &str = "pageserver.pid";

--- a/pageserver/src/bin/test_helper_slow_client_reads.rs
+++ b/pageserver/src/bin/test_helper_slow_client_reads.rs
@@ -0,0 +1,65 @@
+use std::{
+    io::{stdin, stdout, Read, Write},
+    time::Duration,
+};
+
+use clap::Parser;
+use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest};
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+};
+
+#[derive(clap::Parser)]
+struct Args {
+    connstr: String,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    let Args {
+        connstr,
+        tenant_id,
+        timeline_id,
+    } = Args::parse();
+    let client = pageserver_client::page_service::Client::new(connstr).await?;
+    let client = client.pagestream(tenant_id, timeline_id).await?;
+    let (mut sender, _receiver) = client.split();
+
+    eprintln!("filling the pipe");
+    let mut msg = 0;
+    loop {
+        msg += 1;
+        let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test(
+            PagestreamTestRequest {
+                hdr: PagestreamRequest {
+                    reqid: 0,
+                    request_lsn: Lsn(23),
+                    not_modified_since: Lsn(23),
+                },
+                batch_key: 42,
+                message: format!("message {}", msg),
+            },
+        ));
+        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
+            eprintln!("pipe seems full");
+            break;
+        };
+        let _: () = res?;
+    }
+
+    let n = stdout().write(b"R")?;
+    assert_eq!(n, 1);
+    stdout().flush()?;
+
+    eprintln!("waiting for signal to tell us to exit");
+
+    let mut buf = [0u8; 1];
+    stdin().read_exact(&mut buf)?;
+
+    eprintln!("termination signal received, exiting");
+
+    anyhow::Ok(())
+}
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -97,8 +97,8 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::DEFAULT_PG_VERSION;
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
-    CompactInfoResponse, StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest,
-    TimelineGcRequest, TimelineInfo,
+    StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
+    TimelineInfo,
 };
 use utils::{
    auth::SwappableJwtAuth,
@@ -2052,15 +2052,7 @@ async fn timeline_compact_info_handler(
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
-        let res = tenant.get_scheduled_compaction_tasks(timeline_id);
-        let mut resp = Vec::new();
-        for item in res {
-            resp.push(CompactInfoResponse {
-                compact_key_range: item.compact_key_range,
-                compact_lsn_range: item.compact_lsn_range,
-                sub_compaction: item.sub_compaction,
-            });
-        }
+        let resp = tenant.get_scheduled_compaction_tasks(timeline_id);
        json_response(StatusCode::OK, resp)
    }
    .instrument(info_span!("timeline_compact_info", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -91,15 +91,6 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_layers_visited_per_read_global",
-        "Number of layers visited to reconstruct one key",
-        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_layers_visited_per_vectored_read_global",
@@ -1238,6 +1229,16 @@ pub(crate) struct SmgrOpTimerInner {
    timings: SmgrOpTimerState,
 }

+/// The stages of request processing are represented by the enum variants.
+/// Used as part of [`SmgrOpTimerInner::timings`].
+///
+/// Request processing calls into the `SmgrOpTimer::observe_*` methods at the
+/// transition points.
+/// These methods bump relevant counters and then update [`SmgrOpTimerInner::timings`]
+/// to the next state.
+///
+/// Each request goes through every stage, in all configurations.
+///
 #[derive(Debug)]
 enum SmgrOpTimerState {
    Received {
@@ -1247,13 +1248,13 @@ enum SmgrOpTimerState {
        #[allow(dead_code)]
        received_at: Instant,
    },
-    ParsedRoutedNowThrottling {
+    Throttling {
        throttle_started_at: Instant,
    },
-    ParsedRoutedThrottledNowBatching {
+    Batching {
        throttle_done_at: Instant,
    },
-    BatchedNowExecuting {
+    Executing {
        execution_started_at: Instant,
    },
    Flushing,
@@ -1262,6 +1263,7 @@ enum SmgrOpTimerState {

 // NB: when adding observation points, remember to update the Drop impl.
 impl SmgrOpTimer {
+    /// See [`SmgrOpTimerState`] for more context.
    pub(crate) fn observe_throttle_start(&mut self, at: Instant) {
        let Some(inner) = self.0.as_mut() else {
            return;
@@ -1270,24 +1272,26 @@ impl SmgrOpTimer {
            return;
        };
        inner.throttling.count_accounted_start.inc();
-        inner.timings = SmgrOpTimerState::ParsedRoutedNowThrottling {
+        inner.timings = SmgrOpTimerState::Throttling {
            throttle_started_at: at,
        };
    }
+
+    /// See [`SmgrOpTimerState`] for more context.
    pub(crate) fn observe_throttle_done(&mut self, throttle: ThrottleResult) {
        let Some(inner) = self.0.as_mut() else {
            return;
        };
-        let SmgrOpTimerState::ParsedRoutedNowThrottling {
+        let SmgrOpTimerState::Throttling {
            throttle_started_at,
-        } = &mut inner.timings
+        } = &inner.timings
        else {
            return;
        };
        inner.throttling.count_accounted_finish.inc();
        match throttle {
            ThrottleResult::NotThrottled { end } => {
-                inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
+                inner.timings = SmgrOpTimerState::Batching {
                    throttle_done_at: end,
                };
            }
@@ -1299,20 +1303,19 @@ impl SmgrOpTimer {
                    .wait_time
                    .inc_by((end - *throttle_started_at).as_micros().try_into().unwrap());
                // state transition
-                inner.timings = SmgrOpTimerState::ParsedRoutedThrottledNowBatching {
+                inner.timings = SmgrOpTimerState::Batching {
                    throttle_done_at: end,
                };
            }
        }
    }

+    /// See [`SmgrOpTimerState`] for more context.
    pub(crate) fn observe_execution_start(&mut self, at: Instant) {
        let Some(inner) = self.0.as_mut() else {
            return;
        };
-        let SmgrOpTimerState::ParsedRoutedThrottledNowBatching { throttle_done_at } =
-            &mut inner.timings
-        else {
+        let SmgrOpTimerState::Batching { throttle_done_at } = &inner.timings else {
            return;
        };
        // update metrics
@@ -1322,13 +1325,15 @@ impl SmgrOpTimer {
            .per_timeline_batch_wait_time
            .observe(batch.as_secs_f64());
        // state transition
-        inner.timings = SmgrOpTimerState::BatchedNowExecuting {
+        inner.timings = SmgrOpTimerState::Executing {
            execution_started_at: at,
        }
    }

    /// For all but the first caller, this is a no-op.
    /// The first callers receives Some, subsequent ones None.
+    ///
+    /// See [`SmgrOpTimerState`] for more context.
    pub(crate) fn observe_execution_end_flush_start(
        &mut self,
        at: Instant,
@@ -1338,9 +1343,9 @@ impl SmgrOpTimer {
        let Some(mut inner) = self.0.take() else {
            return None;
        };
-        let SmgrOpTimerState::BatchedNowExecuting {
+        let SmgrOpTimerState::Executing {
            execution_started_at,
-        } = &mut inner.timings
+        } = &inner.timings
        else {
            return None;
        };
@@ -1373,6 +1378,14 @@ impl SmgrOpTimer {
    }
 }

+/// The last stage of request processing is serializing and flushing the request
+/// into the TCP connection. We want to make slow flushes observable
+/// _while they are occuring_, so this struct provides a wrapper method [`Self::measure`]
+/// to periodically bump the metric.
+///
+/// If in the future we decide that we're not interested in live updates, we can
+/// add another `observe_*` method to [`SmgrOpTimer`], follow the existing pattern there,
+/// and remove this struct from the code base.
 pub(crate) struct SmgrOpFlushInProgress {
    flush_started_at: Instant,
    global_micros: IntCounter,
@@ -1450,6 +1463,8 @@ pub enum SmgrQueryType {
    GetPageAtLsn,
    GetDbSize,
    GetSlruSegment,
+    #[cfg(feature = "testing")]
+    Test,
 }

 pub(crate) struct SmgrQueryTimePerTimeline {
@@ -3863,7 +3878,6 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {

    // histograms
    [
-        &READ_NUM_LAYERS_VISITED,
        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -554,6 +554,12 @@ struct BatchedGetPageRequest {
    timer: SmgrOpTimer,
 }

+#[cfg(feature = "testing")]
+struct BatchedTestRequest {
+    req: models::PagestreamTestRequest,
+    timer: SmgrOpTimer,
+}
+
 enum BatchedFeMessage {
    Exists {
        span: Span,
@@ -585,6 +591,12 @@ enum BatchedFeMessage {
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
+    #[cfg(feature = "testing")]
+    Test {
+        span: Span,
+        shard: timeline::handle::Handle<TenantManagerTypes>,
+        requests: Vec<BatchedTestRequest>,
+    },
    RespondError {
        span: Span,
        error: BatchedPageStreamError,
@@ -605,6 +617,12 @@ impl BatchedFeMessage {
                    page.timer.observe_execution_start(at);
                }
            }
+            #[cfg(feature = "testing")]
+            BatchedFeMessage::Test { requests, .. } => {
+                for req in requests {
+                    req.timer.observe_execution_start(at);
+                }
+            }
            BatchedFeMessage::RespondError { .. } => {}
        }
    }
@@ -866,6 +884,22 @@ impl PageServerHandler {
                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
                }
            }
+            #[cfg(feature = "testing")]
+            PagestreamFeMessage::Test(req) => {
+                let span = tracing::info_span!(parent: parent_span, "handle_test_request");
+                let shard = timeline_handles
+                    .get(tenant_id, timeline_id, ShardSelector::Zero)
+                    .instrument(span.clone()) // sets `shard_id` field
+                    .await?;
+                let timer =
+                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
+                        .await?;
+                BatchedFeMessage::Test {
+                    span,
+                    shard,
+                    requests: vec![BatchedTestRequest { req, timer }],
+                }
+            }
        };
        Ok(Some(batched_msg))
    }
@@ -926,6 +960,46 @@ impl PageServerHandler {
                accum_pages.extend(this_pages);
                Ok(())
            }
+            #[cfg(feature = "testing")]
+            (
+                Ok(BatchedFeMessage::Test {
+                    shard: accum_shard,
+                    requests: accum_requests,
+                    ..
+                }),
+                BatchedFeMessage::Test {
+                    shard: this_shard,
+                    requests: this_requests,
+                    ..
+                },
+            ) if (|| {
+                assert!(this_requests.len() == 1);
+                if accum_requests.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_requests.len(), max_batch_size.get());
+                    return false;
+                }
+                if (accum_shard.tenant_shard_id, accum_shard.timeline_id)
+                    != (this_shard.tenant_shard_id, this_shard.timeline_id)
+                {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return false;
+                }
+                let this_batch_key = this_requests[0].req.batch_key;
+                let accum_batch_key = accum_requests[0].req.batch_key;
+                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
+                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+                    return false;
+                }
+                true
+            })() =>
+            {
+                // ok to batch
+                accum_requests.extend(this_requests);
+                Ok(())
+            }
            // something batched already but this message is unbatchable
            (_, this_msg) => {
                // by default, don't continue batching
@@ -1052,6 +1126,27 @@ impl PageServerHandler {
                    span,
                )
            }
+            #[cfg(feature = "testing")]
+            BatchedFeMessage::Test {
+                span,
+                shard,
+                requests,
+            } => {
+                fail::fail_point!("ps::handle-pagerequest-message::test");
+                (
+                    {
+                        let npages = requests.len();
+                        trace!(npages, "handling getpage request");
+                        let res = self
+                            .handle_test_request_batch(&shard, requests, ctx)
+                            .instrument(span.clone())
+                            .await;
+                        assert_eq!(res.len(), npages);
+                        res
+                    },
+                    span,
+                )
+            }
            BatchedFeMessage::RespondError { span, error } => {
                // We've already decided to respond with an error, so we don't need to
                // call the handler.
@@ -1775,6 +1870,51 @@ impl PageServerHandler {
        ))
    }

+    // NB: this impl mimics what we do for batched getpage requests.
+    #[cfg(feature = "testing")]
+    #[instrument(skip_all, fields(shard_id))]
+    async fn handle_test_request_batch(
+        &mut self,
+        timeline: &Timeline,
+        requests: Vec<BatchedTestRequest>,
+        _ctx: &RequestContext,
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
+        // real requests would do something with the timeline
+        let mut results = Vec::with_capacity(requests.len());
+        for _req in requests.iter() {
+            tokio::task::yield_now().await;
+
+            results.push({
+                if timeline.cancel.is_cancelled() {
+                    Err(PageReconstructError::Cancelled)
+                } else {
+                    Ok(())
+                }
+            });
+        }
+
+        // TODO: avoid creating the new Vec here
+        Vec::from_iter(
+            requests
+                .into_iter()
+                .zip(results.into_iter())
+                .map(|(req, res)| {
+                    res.map(|()| {
+                        (
+                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
+                                req: req.req.clone(),
+                            }),
+                            req.timer,
+                        )
+                    })
+                    .map_err(|e| BatchedPageStreamError {
+                        err: PageStreamError::from(e),
+                        req: req.req.hdr,
+                    })
+                }),
+        )
+    }
+
    /// Note on "fullbackup":
    /// Full basebackups should only be used for debugging purposes.
    /// Originally, it was introduced to enable breaking storage format changes,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -21,6 +21,7 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use pageserver_api::models;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::models::LsnLease;
 use pageserver_api::models::TimelineArchivalState;
 use pageserver_api::models::TimelineState;
@@ -37,21 +38,17 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::UploadQueueNotReadyError;
 use std::collections::BTreeMap;
-use std::collections::VecDeque;
 use std::fmt;
 use std::future::Future;
 use std::sync::atomic::AtomicBool;
 use std::sync::Weak;
 use std::time::SystemTime;
 use storage_broker::BrokerClientChannel;
-use timeline::compaction::GcCompactJob;
-use timeline::compaction::ScheduledCompactionTask;
+use timeline::compaction::GcCompactionQueue;
 use timeline::import_pgdata;
 use timeline::offload::offload_timeline;
 use timeline::offload::OffloadError;
-use timeline::CompactFlags;
 use timeline::CompactOptions;
-use timeline::CompactionError;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -347,10 +344,8 @@ pub struct Tenant {
    /// Overhead of mutex is acceptable because compaction is done with a multi-second period.
    compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,

-    /// Scheduled compaction tasks. Currently, this can only be populated by triggering
-    /// a manual gc-compaction from the manual compaction API.
-    scheduled_compaction_tasks:
-        std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
+    /// Scheduled gc-compaction tasks.
+    scheduled_compaction_tasks: std::sync::Mutex<HashMap<TimelineId, Arc<GcCompactionQueue>>>,

    /// If the tenant is in Activating state, notify this to encourage it
    /// to proceed to Active as soon as possible, rather than waiting for lazy
@@ -2999,104 +2994,18 @@ impl Tenant {
                if has_pending_l0_compaction_task {
                    Some(true)
                } else {
-                    let mut has_pending_scheduled_compaction_task;
-                    let next_scheduled_compaction_task = {
-                        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                        if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
-                            if !tline_pending_tasks.is_empty() {
-                                info!(
-                                    "{} tasks left in the compaction schedule queue",
-                                    tline_pending_tasks.len()
-                                );
-                            }
-                            let next_task = tline_pending_tasks.pop_front();
-                            has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
-                            next_task
-                        } else {
-                            has_pending_scheduled_compaction_task = false;
-                            None
-                        }
+                    let queue = {
+                        let guard = self.scheduled_compaction_tasks.lock().unwrap();
+                        guard.get(timeline_id).cloned()
                    };
-                    if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
-                    {
-                        if !next_scheduled_compaction_task
-                            .options
-                            .flags
-                            .contains(CompactFlags::EnhancedGcBottomMostCompaction)
-                        {
-                            warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
-                        } else if next_scheduled_compaction_task.options.sub_compaction {
-                            info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
-                            let jobs: Vec<GcCompactJob> = timeline
-                                .gc_compaction_split_jobs(
-                                    GcCompactJob::from_compact_options(
-                                        next_scheduled_compaction_task.options.clone(),
-                                    ),
-                                    next_scheduled_compaction_task
-                                        .options
-                                        .sub_compaction_max_job_size_mb,
-                                )
-                                .await
-                                .map_err(CompactionError::Other)?;
-                            if jobs.is_empty() {
-                                info!("no jobs to run, skipping scheduled compaction task");
-                            } else {
-                                has_pending_scheduled_compaction_task = true;
-                                let jobs_len = jobs.len();
-                                let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-                                let tline_pending_tasks = guard.entry(*timeline_id).or_default();
-                                for (idx, job) in jobs.into_iter().enumerate() {
-                                    // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
-                                    // until we do further refactors to allow directly call `compact_with_gc`.
-                                    let mut flags: EnumSet<CompactFlags> = EnumSet::default();
-                                    flags |= CompactFlags::EnhancedGcBottomMostCompaction;
-                                    if job.dry_run {
-                                        flags |= CompactFlags::DryRun;
-                                    }
-                                    let options = CompactOptions {
-                                        flags,
-                                        sub_compaction: false,
-                                        compact_key_range: Some(job.compact_key_range.into()),
-                                        compact_lsn_range: Some(job.compact_lsn_range.into()),
-                                        sub_compaction_max_job_size_mb: None,
-                                    };
-                                    tline_pending_tasks.push_back(if idx == jobs_len - 1 {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            // The last job in the queue sends the signal and releases the gc guard
-                                            result_tx: next_scheduled_compaction_task
-                                                .result_tx
-                                                .take(),
-                                            gc_block: next_scheduled_compaction_task
-                                                .gc_block
-                                                .take(),
-                                        }
-                                    } else {
-                                        ScheduledCompactionTask {
-                                            options,
-                                            result_tx: None,
-                                            gc_block: None,
-                                        }
-                                    });
-                                }
-                                info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
-                            }
-                        } else {
-                            let _ = timeline
-                                .compact_with_options(
-                                    cancel,
-                                    next_scheduled_compaction_task.options,
-                                    ctx,
-                                )
-                                .instrument(info_span!("scheduled_compact_timeline", %timeline_id))
-                                .await?;
-                            if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
-                                // TODO: we can send compaction statistics in the future
-                                tx.send(()).ok();
-                            }
-                        }
+                    if let Some(queue) = queue {
+                        let has_pending_tasks = queue
+                            .iteration(cancel, ctx, &self.gc_block, timeline)
+                            .await?;
+                        Some(has_pending_tasks)
+                    } else {
+                        Some(false)
                    }
-                    Some(has_pending_scheduled_compaction_task)
                }
            } else {
                None
@@ -3126,34 +3035,32 @@ impl Tenant {
    }

    /// Cancel scheduled compaction tasks
-    pub(crate) fn cancel_scheduled_compaction(
-        &self,
-        timeline_id: TimelineId,
-    ) -> Vec<ScheduledCompactionTask> {
+    pub(crate) fn cancel_scheduled_compaction(&self, timeline_id: TimelineId) {
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
-            let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
-            current_tline_pending_tasks.into_iter().collect()
-        } else {
-            Vec::new()
+        if let Some(q) = guard.get_mut(&timeline_id) {
+            q.cancel_scheduled();
        }
    }

    pub(crate) fn get_scheduled_compaction_tasks(
        &self,
        timeline_id: TimelineId,
-    ) -> Vec<CompactOptions> {
-        use itertools::Itertools;
-        let guard = self.scheduled_compaction_tasks.lock().unwrap();
-        guard
-            .get(&timeline_id)
-            .map(|tline_pending_tasks| {
-                tline_pending_tasks
-                    .iter()
-                    .map(|x| x.options.clone())
-                    .collect_vec()
-            })
-            .unwrap_or_default()
+    ) -> Vec<CompactInfoResponse> {
+        let res = {
+            let guard = self.scheduled_compaction_tasks.lock().unwrap();
+            guard.get(&timeline_id).map(|q| q.remaining_jobs())
+        };
+        let Some((running, remaining)) = res else {
+            return Vec::new();
+        };
+        let mut result = Vec::new();
+        if let Some((id, running)) = running {
+            result.extend(running.into_compact_info_resp(id, true));
+        }
+        for (id, job) in remaining {
+            result.extend(job.into_compact_info_resp(id, false));
+        }
+        result
    }

    /// Schedule a compaction task for a timeline.
@@ -3162,20 +3069,12 @@ impl Tenant {
        timeline_id: TimelineId,
        options: CompactOptions,
    ) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
-        let gc_guard = match self.gc_block.start().await {
-            Ok(guard) => guard,
-            Err(e) => {
-                bail!("cannot run gc-compaction because gc is blocked: {}", e);
-            }
-        };
        let (tx, rx) = tokio::sync::oneshot::channel();
        let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
-        let tline_pending_tasks = guard.entry(timeline_id).or_default();
-        tline_pending_tasks.push_back(ScheduledCompactionTask {
-            options,
-            result_tx: Some(tx),
-            gc_block: Some(gc_guard),
-        });
+        let q = guard
+            .entry(timeline_id)
+            .or_insert_with(|| Arc::new(GcCompactionQueue::new()));
+        q.schedule_manual_compaction(options, Some(tx));
        Ok(rx)
    }

@@ -5791,7 +5690,7 @@ mod tests {
    use bytes::{Bytes, BytesMut};
    use hex_literal::hex;
    use itertools::Itertools;
-    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
    use pageserver_api::keyspace::KeySpace;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
    use pageserver_api::value::Value;
@@ -7850,7 +7749,18 @@ mod tests {
        let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap();
        let base_key_child = Key::from_hex("620000000033333333444444445500000001").unwrap();
        let base_key_nonexist = Key::from_hex("620000000033333333444444445500000002").unwrap();
+        let base_key_overwrite = Key::from_hex("620000000033333333444444445500000003").unwrap();
+
+        let base_inherited_key = Key::from_hex("610000000033333333444444445500000000").unwrap();
+        let base_inherited_key_child =
+            Key::from_hex("610000000033333333444444445500000001").unwrap();
+        let base_inherited_key_nonexist =
+            Key::from_hex("610000000033333333444444445500000002").unwrap();
+        let base_inherited_key_overwrite =
+            Key::from_hex("610000000033333333444444445500000003").unwrap();
+
        assert_eq!(base_key.field1, AUX_KEY_PREFIX); // in case someone accidentally changed the prefix...
+        assert_eq!(base_inherited_key.field1, RELATION_SIZE_PREFIX);

        let tline = tenant
            .create_test_timeline_with_layers(
@@ -7859,7 +7769,18 @@ mod tests {
                DEFAULT_PG_VERSION,
                &ctx,
                Vec::new(), // delta layers
-                vec![(Lsn(0x20), vec![(base_key, test_img("metadata key 1"))])], // image layers
+                vec![(
+                    Lsn(0x20),
+                    vec![
+                        (base_inherited_key, test_img("metadata inherited key 1")),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 1a"),
+                        ),
+                        (base_key, test_img("metadata key 1")),
+                        (base_key_overwrite, test_img("metadata key overwrite 1b")),
+                    ],
+                )], // image layers
                Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
            )
            .await?;
@@ -7873,7 +7794,18 @@ mod tests {
                Vec::new(), // delta layers
                vec![(
                    Lsn(0x30),
-                    vec![(base_key_child, test_img("metadata key 2"))],
+                    vec![
+                        (
+                            base_inherited_key_child,
+                            test_img("metadata inherited key 2"),
+                        ),
+                        (
+                            base_inherited_key_overwrite,
+                            test_img("metadata key overwrite 2a"),
+                        ),
+                        (base_key_child, test_img("metadata key 2")),
+                        (base_key_overwrite, test_img("metadata key overwrite 2b")),
+                    ],
                )], // image layers
                Lsn(0x30),
            )
@@ -7895,6 +7827,26 @@ mod tests {
            get_vectored_impl_wrapper(&tline, base_key_nonexist, lsn, &ctx).await?,
            None
        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_child, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&tline, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 1a"))
+        );

        // test vectored get on child timeline
        assert_eq!(
@@ -7909,6 +7861,82 @@ mod tests {
            get_vectored_impl_wrapper(&child, base_key_nonexist, lsn, &ctx).await?,
            None
        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 1"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_child, lsn, &ctx).await?,
+            Some(test_img("metadata inherited key 2"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_nonexist, lsn, &ctx).await?,
+            None
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2b"))
+        );
+        assert_eq!(
+            get_vectored_impl_wrapper(&child, base_inherited_key_overwrite, lsn, &ctx).await?,
+            Some(test_img("metadata key overwrite 2a"))
+        );
+
+        // test vectored scan on parent timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = tline
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 1a")
+                ),
+                (base_key, test_img("metadata key 1")),
+                (base_key_overwrite, test_img("metadata key overwrite 1b")),
+            ]
+        );
+
+        // test vectored scan on child timeline
+        let mut reconstruct_state = ValuesReconstructState::new();
+        let res = child
+            .get_vectored_impl(
+                KeySpace::single(Key::metadata_key_range()),
+                lsn,
+                &mut reconstruct_state,
+                &ctx,
+            )
+            .await?;
+
+        assert_eq!(
+            res.into_iter()
+                .map(|(k, v)| (k, v.unwrap()))
+                .collect::<Vec<_>>(),
+            vec![
+                (base_inherited_key, test_img("metadata inherited key 1")),
+                (
+                    base_inherited_key_child,
+                    test_img("metadata inherited key 2")
+                ),
+                (
+                    base_inherited_key_overwrite,
+                    test_img("metadata key overwrite 2a")
+                ),
+                (base_key_child, test_img("metadata key 2")),
+                (base_key_overwrite, test_img("metadata key overwrite 2b")),
+            ]
+        );

        Ok(())
    }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -11,7 +11,7 @@
 pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
 use pageserver_api::models::CompactionAlgorithmSettings;
 use pageserver_api::models::EvictionPolicy;
-use pageserver_api::models::{self, TenantConfigPatch, ThrottleConfig};
+use pageserver_api::models::{self, TenantConfigPatch};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -597,7 +597,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
                .map(humantime),
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
-            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
+            timeline_get_throttle: value.timeline_get_throttle,
            image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
            lsn_lease_length: value.lsn_lease_length.map(humantime),
            lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -84,17 +84,17 @@ impl Value {

    fn to_u64(self) -> u64 {
        let b = &self.0;
-        (b[0] as u64) << 32
-            | (b[1] as u64) << 24
-            | (b[2] as u64) << 16
-            | (b[3] as u64) << 8
+        ((b[0] as u64) << 32)
+            | ((b[1] as u64) << 24)
+            | ((b[2] as u64) << 16)
+            | ((b[3] as u64) << 8)
            | b[4] as u64
    }

    fn to_blknum(self) -> u32 {
        let b = &self.0;
        assert!(b[0] == 0x80);
-        (b[1] as u32) << 24 | (b[2] as u32) << 16 | (b[3] as u32) << 8 | b[4] as u32
+        ((b[1] as u32) << 24) | ((b[2] as u32) << 16) | ((b[3] as u32) << 8) | b[4] as u32
    }
 }

--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -320,7 +320,6 @@ impl TimelineMetadata {

    // Checksums make it awkward to build a valid instance by hand.  This helper
    // provides a TimelineMetadata with a valid checksum in its header.
-    #[cfg(test)]
    pub fn example() -> Self {
        let instance = Self::new(
            "0/16960E8".parse::<Lsn>().unwrap(),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -63,22 +63,18 @@
 //! The contract between client and its user is that the user is responsible of
 //! scheduling operations in an order that keeps the remote consistent as
 //! described above.
+//!
 //! From the user's perspective, the operations are executed sequentially.
 //! Internally, the client knows which operations can be performed in parallel,
 //! and which operations act like a "barrier" that require preceding operations
 //! to finish. The calling code just needs to call the schedule-functions in the
 //! correct order, and the client will parallelize the operations in a way that
-//! is safe.
-//!
-//! The caller should be careful with deletion, though. They should not delete
-//! local files that have been scheduled for upload but not yet finished uploading.
-//! Otherwise the upload will fail. To wait for an upload to finish, use
-//! the 'wait_completion' function (more on that later.)
+//! is safe. For more details, see `UploadOp::can_bypass`.
 //!
 //! All of this relies on the following invariants:
 //!
 //! - We rely on read-after write consistency in the remote storage.
-//! - Layer files are immutable
+//! - Layer files are immutable.
 //!
 //! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
 //! storage. Different tenants can be attached to different pageservers, but if the
@@ -429,8 +425,16 @@ impl RemoteTimelineClient {
    /// an index file upload, i.e., it's not empty.
    /// The given `index_part` must be the one on the remote.
    pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        info!(
            "initialized upload queue from remote index with {} layer files",
@@ -445,8 +449,16 @@ impl RemoteTimelineClient {
        &self,
        local_metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
+        // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
+        // certainly no point in starting more upload tasks than this.
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);
        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata)?;
+        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
        self.update_remote_physical_size_gauge(None);
        info!("initialized upload queue as empty");
        Ok(())
@@ -462,9 +474,15 @@ impl RemoteTimelineClient {
        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
        ))?;
+        let inprogress_limit = self
+            .conf
+            .remote_storage_config
+            .as_ref()
+            .and_then(|r| r.concurrency_limit())
+            .unwrap_or(0);

        let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_with_current_remote_index_part(index_part)?;
+        upload_queue.initialize_with_current_remote_index_part(index_part, inprogress_limit)?;
        self.update_remote_physical_size_gauge(Some(index_part));
        self.stop_impl(&mut upload_queue);

@@ -1855,57 +1873,17 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    ///
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
    ///
-    /// The caller needs to already hold the `upload_queue` lock.
+    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
+    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
+    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
    fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
-        while let Some(next_op) = upload_queue.queued_operations.front() {
-            // Can we run this task now?
-            let can_run_now = match next_op {
-                UploadOp::UploadLayer(..) => {
-                    // Can always be scheduled.
-                    true
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    // These can only be performed after all the preceding operations
-                    // have finished.
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-                UploadOp::Delete(..) => {
-                    // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
-                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
-                }
+        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
+            debug!("starting op: {next_op}");

-                UploadOp::Barrier(_) | UploadOp::Shutdown => {
-                    upload_queue.inprogress_tasks.is_empty()
-                }
-            };
-
-            // If we cannot launch this task, don't look any further.
-            //
-            // In some cases, we could let some non-frontmost tasks to "jump the queue" and launch
-            // them now, but we don't try to do that currently.  For example, if the frontmost task
-            // is an index-file upload that cannot proceed until preceding uploads have finished, we
-            // could still start layer uploads that were scheduled later.
-            if !can_run_now {
-                break;
-            }
-
-            if let UploadOp::Shutdown = next_op {
-                // leave the op in the queue but do not start more tasks; it will be dropped when
-                // the stop is called.
-                upload_queue.shutdown_ready.close();
-                break;
-            }
-
-            // We can launch this task. Remove it from the queue first.
-            let mut next_op = upload_queue.queued_operations.pop_front().unwrap();
-
-            debug!("starting op: {}", next_op);
-
-            // Update the counters and prepare
+            // Prepare upload.
            match &mut next_op {
                UploadOp::UploadLayer(layer, meta, mode) => {
                    if upload_queue
@@ -1916,18 +1894,14 @@ impl RemoteTimelineClient {
                    } else {
                        *mode = Some(OpType::MayReorder)
                    }
-                    upload_queue.num_inprogress_layer_uploads += 1;
-                }
-                UploadOp::UploadMetadata { .. } => {
-                    upload_queue.num_inprogress_metadata_uploads += 1;
                }
+                UploadOp::UploadMetadata { .. } => {}
                UploadOp::Delete(Delete { layers }) => {
                    for (name, meta) in layers {
                        upload_queue
                            .recently_deleted
                            .insert((name.clone(), meta.generation));
                    }
-                    upload_queue.num_inprogress_deletions += 1;
                }
                UploadOp::Barrier(sender) => {
                    sender.send_replace(());
@@ -1944,6 +1918,7 @@ impl RemoteTimelineClient {
            let task = Arc::new(UploadTask {
                task_id: upload_task_id,
                op: next_op,
+                coalesced_ops,
                retries: AtomicU32::new(0),
            });
            upload_queue
@@ -2027,6 +2002,8 @@ impl RemoteTimelineClient {

            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer, ref layer_metadata, mode) => {
+                    // TODO: check if this mechanism can be removed now that can_bypass() performs
+                    // conflict checks during scheduling.
                    if let Some(OpType::FlushDeletion) = mode {
                        if self.config.read().unwrap().block_deletions {
                            // Of course, this is not efficient... but usually the queue should be empty.
@@ -2249,13 +2226,8 @@ impl RemoteTimelineClient {
            upload_queue.inprogress_tasks.remove(&task.task_id);

            let lsn_update = match task.op {
-                UploadOp::UploadLayer(_, _, _) => {
-                    upload_queue.num_inprogress_layer_uploads -= 1;
-                    None
-                }
+                UploadOp::UploadLayer(_, _, _) => None,
                UploadOp::UploadMetadata { ref uploaded } => {
-                    upload_queue.num_inprogress_metadata_uploads -= 1;
-
                    // the task id is reused as a monotonicity check for storing the "clean"
                    // IndexPart.
                    let last_updater = upload_queue.clean.1;
@@ -2289,10 +2261,7 @@ impl RemoteTimelineClient {
                        None
                    }
                }
-                UploadOp::Delete(_) => {
-                    upload_queue.num_inprogress_deletions -= 1;
-                    None
-                }
+                UploadOp::Delete(_) => None,
                UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
            };

@@ -2317,6 +2286,9 @@ impl RemoteTimelineClient {
        }

        self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
    }

    fn metric_impl(
@@ -2409,6 +2381,7 @@ impl RemoteTimelineClient {
                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
                    let upload_queue_for_deletion = UploadQueueInitialized {
+                        inprogress_limit: initialized.inprogress_limit,
                        task_counter: 0,
                        dirty: initialized.dirty.clone(),
                        clean: initialized.clean.clone(),
@@ -2416,9 +2389,6 @@ impl RemoteTimelineClient {
                        visible_remote_consistent_lsn: initialized
                            .visible_remote_consistent_lsn
                            .clone(),
-                        num_inprogress_layer_uploads: 0,
-                        num_inprogress_metadata_uploads: 0,
-                        num_inprogress_deletions: 0,
                        inprogress_tasks: HashMap::default(),
                        queued_operations: VecDeque::default(),
                        #[cfg(feature = "testing")]
@@ -2445,14 +2415,6 @@ impl RemoteTimelineClient {
                    }
                };

-                // consistency check
-                assert_eq!(
-                    qi.num_inprogress_layer_uploads
-                        + qi.num_inprogress_metadata_uploads
-                        + qi.num_inprogress_deletions,
-                    qi.inprogress_tasks.len()
-                );
-
                // We don't need to do anything here for in-progress tasks. They will finish
                // on their own, decrement the unfinished-task counter themselves, and observe
                // that the queue is Stopped.
@@ -2899,8 +2861,8 @@ mod tests {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
            assert!(upload_queue.queued_operations.is_empty());
-            assert!(upload_queue.inprogress_tasks.len() == 2);
-            assert!(upload_queue.num_inprogress_layer_uploads == 2);
+            assert_eq!(upload_queue.inprogress_tasks.len(), 2);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 2);

            // also check that `latest_file_changes` was updated
            assert!(upload_queue.latest_files_changes_since_metadata_upload_scheduled == 2);
@@ -2970,8 +2932,8 @@ mod tests {
            // Deletion schedules upload of the index file, and the file deletion itself
            assert_eq!(upload_queue.queued_operations.len(), 2);
            assert_eq!(upload_queue.inprogress_tasks.len(), 1);
-            assert_eq!(upload_queue.num_inprogress_layer_uploads, 1);
-            assert_eq!(upload_queue.num_inprogress_deletions, 0);
+            assert_eq!(upload_queue.num_inprogress_layer_uploads(), 1);
+            assert_eq!(upload_queue.num_inprogress_deletions(), 0);
            assert_eq!(
                upload_queue.latest_files_changes_since_metadata_upload_scheduled,
                0
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -104,7 +104,7 @@ impl IndexPart {

    pub const FILE_NAME: &'static str = "index_part.json";

-    pub(crate) fn empty(metadata: TimelineMetadata) -> Self {
+    pub fn empty(metadata: TimelineMetadata) -> Self {
        IndexPart {
            version: Self::LATEST_VERSION,
            layer_metadata: Default::default(),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -12,7 +12,7 @@ pub mod merge_iterator;

 use crate::context::{AccessStatsBehavior, RequestContext};
 use bytes::Bytes;
-use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
@@ -209,7 +209,7 @@ impl ValuesReconstructState {
            .keys
            .entry(*key)
            .or_insert(Ok(VectoredValueReconstructState::default()));
-        let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
+        let is_sparse_key = key.is_sparse();
        if let Ok(state) = state {
            let key_done = match state.situation {
                ValueReconstructSituation::Complete => {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -112,8 +112,8 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
 ///
 /// Layout:
 /// - 1 bit: `will_init`
-/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
-/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
+/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`][]: `len`
+/// - [`MAX_SUPPORTED_POS_BITS`](IndexEntry::MAX_SUPPORTED_POS_BITS): `pos`
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct IndexEntry(u64);

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1812,7 +1812,7 @@ enum LayerKind {

 /// Guard for forcing a layer be resident while it exists.
 #[derive(Clone)]
-pub(crate) struct ResidentLayer {
+pub struct ResidentLayer {
    owner: Layer,
    downloaded: Arc<DownloadedLayer>,
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -27,7 +27,7 @@ use pageserver_api::{
    config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
    key::{
        KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
-        NON_INHERITED_SPARSE_RANGE,
+        SPARSE_RANGE,
    },
    keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
    models::{
@@ -3221,7 +3221,7 @@ impl Timeline {
            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
            // stalling compaction.
            keyspace.remove_overlapping_with(&KeySpace {
-                ranges: vec![NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE],
+                ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
            });

            // Keyspace is fully retrieved
@@ -3242,7 +3242,11 @@ impl Timeline {
            // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
            // space. If that's not the case, we had at least one key encounter a gap in the image layer
            // and stop the search as a result of that.
-            let removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            // Do not fire missing key error for sparse keys.
+            removed.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
            if !removed.is_empty() {
                break Some(removed);
            }
@@ -3257,6 +3261,21 @@ impl Timeline {
            timeline = &*timeline_owned;
        };

+        // Remove sparse keys from the keyspace so that it doesn't fire errors.
+        let missing_keyspace = if let Some(missing_keyspace) = missing_keyspace {
+            let mut missing_keyspace = missing_keyspace;
+            missing_keyspace.remove_overlapping_with(&KeySpace {
+                ranges: vec![SPARSE_RANGE],
+            });
+            if missing_keyspace.is_empty() {
+                None
+            } else {
+                Some(missing_keyspace)
+            }
+        } else {
+            None
+        };
+
        if let Some(missing_keyspace) = missing_keyspace {
            return Err(GetVectoredError::MissingKey(MissingKeyError {
                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
@@ -3762,36 +3781,35 @@ impl Timeline {
                return Err(FlushLayerError::Cancelled);
            }

-            let mut layers_to_upload = Vec::new();
-            layers_to_upload.extend(
-                self.create_image_layers(
-                    &rel_partition,
-                    self.initdb_lsn,
-                    ImageLayerCreationMode::Initial,
-                    ctx,
-                )
-                .await?,
-            );
+            // Ensure that we have a single call to `create_image_layers` with a combined dense keyspace.
+            // So that the key ranges don't overlap.
+            let mut partitions = KeyPartitioning::default();
+            partitions.parts.extend(rel_partition.parts);
            if !metadata_partition.parts.is_empty() {
                assert_eq!(
                    metadata_partition.parts.len(),
                    1,
                    "currently sparse keyspace should only contain a single metadata keyspace"
                );
-                layers_to_upload.extend(
-                    self.create_image_layers(
-                        // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
-                        // every single key within the keyspace, and therefore, it's safe to force converting it
-                        // into a dense keyspace before calling this function.
-                        &metadata_partition.into_dense(),
-                        self.initdb_lsn,
-                        ImageLayerCreationMode::Initial,
-                        ctx,
-                    )
-                    .await?,
-                );
+                // Safety: create_image_layers treat sparse keyspaces differently that it does not scan
+                // every single key within the keyspace, and therefore, it's safe to force converting it
+                // into a dense keyspace before calling this function.
+                partitions
+                    .parts
+                    .extend(metadata_partition.into_dense().parts);
            }

+            let mut layers_to_upload = Vec::new();
+            layers_to_upload.extend(
+                self.create_image_layers(
+                    &partitions,
+                    self.initdb_lsn,
+                    ImageLayerCreationMode::Initial,
+                    ctx,
+                )
+                .await?,
+            );
+
            (layers_to_upload, None)
        } else {
            // Normal case, write out a L0 delta layer file.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,7 +4,7 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.

-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;

@@ -16,10 +16,12 @@ use super::{

 use anyhow::{anyhow, bail, Context};
 use bytes::Bytes;
+use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::key::KEY_SIZE;
 use pageserver_api::keyspace::ShardedRange;
+use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
@@ -30,6 +32,7 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
 use crate::page_cache;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
+use crate::tenant::gc_block::GcBlock;
 use crate::tenant::remote_timeline_client::WaitCompletionError;
 use crate::tenant::storage_layer::batch_split_writer::{
    BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -63,16 +66,284 @@ use super::CompactionError;
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
 const COMPACTION_DELTA_THRESHOLD: usize = 5;

-/// A scheduled compaction task.
-pub(crate) struct ScheduledCompactionTask {
-    /// It's unfortunate that we need to store a compact options struct here because the only outer
-    /// API we can call here is `compact_with_options` which does a few setup calls before starting the
-    /// actual compaction job... We should refactor this to store `GcCompactionJob` in the future.
-    pub options: CompactOptions,
-    /// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
-    pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
-    /// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
-    pub gc_block: Option<gc_block::Guard>,
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub struct GcCompactionJobId(pub usize);
+
+impl std::fmt::Display for GcCompactionJobId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum GcCompactionQueueItem {
+    Manual(CompactOptions),
+    SubCompactionJob(CompactOptions),
+    #[allow(dead_code)]
+    UpdateL2Lsn(Lsn),
+    Notify(GcCompactionJobId),
+}
+
+impl GcCompactionQueueItem {
+    pub fn into_compact_info_resp(
+        self,
+        id: GcCompactionJobId,
+        running: bool,
+    ) -> Option<CompactInfoResponse> {
+        match self {
+            GcCompactionQueueItem::Manual(options) => Some(CompactInfoResponse {
+                compact_key_range: options.compact_key_range,
+                compact_lsn_range: options.compact_lsn_range,
+                sub_compaction: options.sub_compaction,
+                running,
+                job_id: id.0,
+            }),
+            GcCompactionQueueItem::SubCompactionJob(options) => Some(CompactInfoResponse {
+                compact_key_range: options.compact_key_range,
+                compact_lsn_range: options.compact_lsn_range,
+                sub_compaction: options.sub_compaction,
+                running,
+                job_id: id.0,
+            }),
+            GcCompactionQueueItem::UpdateL2Lsn(_) => None,
+            GcCompactionQueueItem::Notify(_) => None,
+        }
+    }
+}
+
+struct GcCompactionQueueInner {
+    running: Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+    queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    notify: HashMap<GcCompactionJobId, tokio::sync::oneshot::Sender<()>>,
+    gc_guards: HashMap<GcCompactionJobId, gc_block::Guard>,
+    last_id: GcCompactionJobId,
+}
+
+impl GcCompactionQueueInner {
+    fn next_id(&mut self) -> GcCompactionJobId {
+        let id = self.last_id;
+        self.last_id = GcCompactionJobId(id.0 + 1);
+        id
+    }
+}
+
+/// A structure to store gc_compaction jobs.
+pub struct GcCompactionQueue {
+    /// All items in the queue, and the currently-running job.
+    inner: std::sync::Mutex<GcCompactionQueueInner>,
+    /// Ensure only one thread is consuming the queue.
+    consumer_lock: tokio::sync::Mutex<()>,
+}
+
+impl GcCompactionQueue {
+    pub fn new() -> Self {
+        GcCompactionQueue {
+            inner: std::sync::Mutex::new(GcCompactionQueueInner {
+                running: None,
+                queued: VecDeque::new(),
+                notify: HashMap::new(),
+                gc_guards: HashMap::new(),
+                last_id: GcCompactionJobId(0),
+            }),
+            consumer_lock: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    pub fn cancel_scheduled(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.queued.clear();
+        guard.notify.clear();
+        guard.gc_guards.clear();
+    }
+
+    /// Schedule a manual compaction job.
+    pub fn schedule_manual_compaction(
+        &self,
+        options: CompactOptions,
+        notify: Option<tokio::sync::oneshot::Sender<()>>,
+    ) -> GcCompactionJobId {
+        let mut guard = self.inner.lock().unwrap();
+        let id = guard.next_id();
+        guard
+            .queued
+            .push_back((id, GcCompactionQueueItem::Manual(options)));
+        if let Some(notify) = notify {
+            guard.notify.insert(id, notify);
+        }
+        info!("scheduled compaction job id={}", id);
+        id
+    }
+
+    /// Trigger an auto compaction.
+    #[allow(dead_code)]
+    pub fn trigger_auto_compaction(&self, _: &Arc<Timeline>) {}
+
+    /// Notify the caller the job has finished and unblock GC.
+    fn notify_and_unblock(&self, id: GcCompactionJobId) {
+        info!("compaction job id={} finished", id);
+        let mut guard = self.inner.lock().unwrap();
+        if let Some(blocking) = guard.gc_guards.remove(&id) {
+            drop(blocking)
+        }
+        if let Some(tx) = guard.notify.remove(&id) {
+            let _ = tx.send(());
+        }
+    }
+
+    async fn handle_sub_compaction(
+        &self,
+        id: GcCompactionJobId,
+        options: CompactOptions,
+        timeline: &Arc<Timeline>,
+        gc_block: &GcBlock,
+    ) -> Result<(), CompactionError> {
+        info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
+        let jobs: Vec<GcCompactJob> = timeline
+            .gc_compaction_split_jobs(
+                GcCompactJob::from_compact_options(options.clone()),
+                options.sub_compaction_max_job_size_mb,
+            )
+            .await
+            .map_err(CompactionError::Other)?;
+        if jobs.is_empty() {
+            info!("no jobs to run, skipping scheduled compaction task");
+            self.notify_and_unblock(id);
+        } else {
+            let gc_guard = match gc_block.start().await {
+                Ok(guard) => guard,
+                Err(e) => {
+                    return Err(CompactionError::Other(anyhow!(
+                        "cannot run gc-compaction because gc is blocked: {}",
+                        e
+                    )));
+                }
+            };
+
+            let jobs_len = jobs.len();
+            let mut pending_tasks = Vec::new();
+            for job in jobs {
+                // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
+                // until we do further refactors to allow directly call `compact_with_gc`.
+                let mut flags: EnumSet<CompactFlags> = EnumSet::default();
+                flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                if job.dry_run {
+                    flags |= CompactFlags::DryRun;
+                }
+                let options = CompactOptions {
+                    flags,
+                    sub_compaction: false,
+                    compact_key_range: Some(job.compact_key_range.into()),
+                    compact_lsn_range: Some(job.compact_lsn_range.into()),
+                    sub_compaction_max_job_size_mb: None,
+                };
+                pending_tasks.push(GcCompactionQueueItem::SubCompactionJob(options));
+            }
+            pending_tasks.push(GcCompactionQueueItem::Notify(id));
+            {
+                let mut guard = self.inner.lock().unwrap();
+                guard.gc_guards.insert(id, gc_guard);
+                let mut tasks = Vec::new();
+                for task in pending_tasks {
+                    let id = guard.next_id();
+                    tasks.push((id, task));
+                }
+                tasks.reverse();
+                for item in tasks {
+                    guard.queued.push_front(item);
+                }
+            }
+            info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
+        }
+        Ok(())
+    }
+
+    /// Take a job from the queue and process it. Returns if there are still pending tasks.
+    pub async fn iteration(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+        gc_block: &GcBlock,
+        timeline: &Arc<Timeline>,
+    ) -> Result<bool, CompactionError> {
+        let _one_op_at_a_time_guard = self.consumer_lock.lock().await;
+        let has_pending_tasks;
+        let (id, item) = {
+            let mut guard = self.inner.lock().unwrap();
+            let Some((id, item)) = guard.queued.pop_front() else {
+                return Ok(false);
+            };
+            guard.running = Some((id, item.clone()));
+            has_pending_tasks = !guard.queued.is_empty();
+            (id, item)
+        };
+
+        match item {
+            GcCompactionQueueItem::Manual(options) => {
+                if !options
+                    .flags
+                    .contains(CompactFlags::EnhancedGcBottomMostCompaction)
+                {
+                    warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", options);
+                } else if options.sub_compaction {
+                    self.handle_sub_compaction(id, options, timeline, gc_block)
+                        .await?;
+                } else {
+                    let gc_guard = match gc_block.start().await {
+                        Ok(guard) => guard,
+                        Err(e) => {
+                            return Err(CompactionError::Other(anyhow!(
+                                "cannot run gc-compaction because gc is blocked: {}",
+                                e
+                            )));
+                        }
+                    };
+                    {
+                        let mut guard = self.inner.lock().unwrap();
+                        guard.gc_guards.insert(id, gc_guard);
+                    }
+                    let _ = timeline
+                        .compact_with_options(cancel, options, ctx)
+                        .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                        .await?;
+                    self.notify_and_unblock(id);
+                }
+            }
+            GcCompactionQueueItem::SubCompactionJob(options) => {
+                let _ = timeline
+                    .compact_with_options(cancel, options, ctx)
+                    .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
+                    .await?;
+            }
+            GcCompactionQueueItem::Notify(id) => {
+                self.notify_and_unblock(id);
+            }
+            GcCompactionQueueItem::UpdateL2Lsn(_) => {
+                unreachable!()
+            }
+        }
+        {
+            let mut guard = self.inner.lock().unwrap();
+            guard.running = None;
+        }
+        Ok(has_pending_tasks)
+    }
+
+    #[allow(clippy::type_complexity)]
+    pub fn remaining_jobs(
+        &self,
+    ) -> (
+        Option<(GcCompactionJobId, GcCompactionQueueItem)>,
+        VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
+    ) {
+        let guard = self.inner.lock().unwrap();
+        (guard.running.clone(), guard.queued.clone())
+    }
+
+    #[allow(dead_code)]
+    pub fn remaining_jobs_num(&self) -> usize {
+        let guard = self.inner.lock().unwrap();
+        guard.queued.len() + if guard.running.is_some() { 1 } else { 0 }
+    }
 }

 /// A job description for the gc-compaction job. This structure describes the rectangle range that the job will
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -403,7 +403,7 @@ pub(super) async fn handle_walreceiver_connection(
                // need to advance last record LSN on all shards. If we've not ingested the latest
                // record, then set the LSN of the modification past it. This way all shards
                // advance their last record LSN at the same time.
-                let needs_last_record_lsn_advance = match next_record_lsn.map(Lsn::from) {
+                let needs_last_record_lsn_advance = match next_record_lsn {
                    Some(lsn) if lsn > modification.get_lsn() => {
                        modification.set_lsn(lsn).unwrap();
                        true
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -308,7 +308,7 @@ impl WalIngest {
            epoch -= 1;
        }

-        Ok((epoch as u64) << 32 | xid as u64)
+        Ok(((epoch as u64) << 32) | xid as u64)
    }

    async fn ingest_clear_vm_bits(