Return metrics from configure request

fix(consumption): rename _size_ => _data_ (#4866 )
I failed at renaming the metric middle part while managing to do a great job with the suffix. Fix the middle part as well.
2026-05-13 03:00:37 +00:00 · 2023-08-03 11:30:52 -04:00 · 2023-08-01 19:18:25 +03:00 · 2023-08-01 18:28:18 +03:00 · 2023-08-01 18:14:16 +03:00 · 2023-08-01 15:30:36 +03:00
19 changed files with 505 additions and 387 deletions
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -30,6 +30,7 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
        status: state.status,
        last_active: state.last_active,
        error: state.error.clone(),
+        metrics: state.metrics.clone(),
    }
 }

--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -21,6 +21,7 @@ pub struct ComputeStatusResponse {
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: Option<DateTime<Utc>>,
    pub error: Option<String>,
+    pub metrics: ComputeMetrics,
 }

 #[derive(Deserialize, Serialize)]
@@ -66,7 +67,7 @@ where
 }

 /// Response of the /metrics.json API
-#[derive(Clone, Debug, Default, Serialize)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 pub struct ComputeMetrics {
    pub wait_for_spec_ms: u64,
    pub sync_safekeepers_ms: u64,
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -57,7 +57,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: String) -> String {
+pub fn idempotency_key(node_id: &str) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -71,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T> {
-    pub events: &'a [T],
+pub struct EventChunk<'a, T: Clone> {
+    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -68,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
            true
        },
    )?;
-    let mut cursor = BlockCursor::new(&file);
+    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos())?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -14,14 +14,16 @@ use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::time::Duration;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime};
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug)]
+#[derive(Serialize, Debug, Clone, Copy)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -32,13 +34,13 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageserverConsumptionMetricsKey {
-    pub tenant_id: TenantId,
-    pub timeline_id: Option<TimelineId>,
-    pub metric: &'static str,
+struct MetricsKey {
+    tenant_id: TenantId,
+    timeline_id: Option<TimelineId>,
+    metric: &'static str,
 }

-impl PageserverConsumptionMetricsKey {
+impl MetricsKey {
    const fn absolute_values(self) -> AbsoluteValueFactory {
        AbsoluteValueFactory(self)
    }
@@ -48,18 +50,17 @@ impl PageserverConsumptionMetricsKey {
 }

 /// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
+struct AbsoluteValueFactory(MetricsKey);

 impl AbsoluteValueFactory {
-    fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
+    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
        let key = self.0;
-        let time = Utc::now();
        (key, (EventType::Absolute { time }, val))
    }
 }

 /// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
+struct IncrementalValueFactory(MetricsKey);

 impl IncrementalValueFactory {
    #[allow(clippy::wrong_self_convention)]
@@ -68,7 +69,7 @@ impl IncrementalValueFactory {
        prev_end: DateTime<Utc>,
        up_to: DateTime<Utc>,
        val: u64,
-    ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
+    ) -> (MetricsKey, (EventType, u64)) {
        let key = self.0;
        // cannot assert prev_end < up_to because these are realtime clock based
        (
@@ -83,15 +84,18 @@ impl IncrementalValueFactory {
        )
    }

-    fn key(&self) -> &PageserverConsumptionMetricsKey {
+    fn key(&self) -> &MetricsKey {
        &self.0
    }
 }

-// the static part of a PageserverConsumptionMetricsKey
-impl PageserverConsumptionMetricsKey {
+// the static part of a MetricsKey
+impl MetricsKey {
+    /// Absolute value of [`Timeline::get_last_record_lsn`].
+    ///
+    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: Some(timeline_id),
            metric: "written_size",
@@ -99,25 +103,31 @@ impl PageserverConsumptionMetricsKey {
        .absolute_values()
    }

-    /// Values will be the difference of the latest written_size (last_record_lsn) to what we
-    /// previously sent.
+    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
+    /// previously sent, starting from the previously sent incremental time range ending at the
+    /// latest absolute measurement.
    const fn written_size_delta(
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> IncrementalValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: Some(timeline_id),
-            metric: "written_size_bytes_delta",
+            // the name here is correctly about data not size, because that is what is wanted by
+            // downstream pipeline
+            metric: "written_data_bytes_delta",
        }
        .incremental_values()
    }

+    /// Exact [`Timeline::get_current_logical_size`].
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
    const fn timeline_logical_size(
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: Some(timeline_id),
            metric: "timeline_logical_size",
@@ -125,8 +135,11 @@ impl PageserverConsumptionMetricsKey {
        .absolute_values()
    }

+    /// [`Tenant::remote_size`]
+    ///
+    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: None,
            metric: "remote_storage_size",
@@ -134,8 +147,11 @@ impl PageserverConsumptionMetricsKey {
        .absolute_values()
    }

+    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
+    ///
+    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: None,
            metric: "resident_size",
@@ -143,8 +159,11 @@ impl PageserverConsumptionMetricsKey {
        .absolute_values()
    }

+    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    ///
+    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        PageserverConsumptionMetricsKey {
+        MetricsKey {
            tenant_id,
            timeline_id: None,
            metric: "synthetic_storage_size",
@@ -228,15 +247,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-pub async fn collect_metrics_iteration(
+async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
+    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
+    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -270,130 +289,48 @@ pub async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines().iter() {
+        for timeline in tenant.list_timelines() {
            // collect per-timeline metrics only for active timelines
-            if timeline.is_active() {
-                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-                let (key, written_size_now) =
-                    PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
-                        .now(timeline_written_size);
+            let timeline_id = timeline.timeline_id;

-                // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-                // features might change this.
-
-                let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
-                    tenant_id,
-                    timeline.timeline_id,
-                );
-
-                // use this when available, because in a stream of incremental values, it will be
-                // accurate where as when last_record_lsn stops moving, we will only cache the last
-                // one of those.
-                let last_stop_time =
-                    cached_metrics
-                        .get(written_size_delta_key.key())
-                        .map(|(until, _val)| {
-                            until
-                                .incremental_timerange()
-                                .expect("never create EventType::Absolute for written_size_delta")
-                                .end
-                        });
-
-                // by default, use the last sent written_size as the basis for
-                // calculating the delta. if we don't yet have one, use the load time value.
-                let prev = cached_metrics
-                    .get(&key)
-                    .map(|(prev_at, prev)| {
-                        // use the prev time from our last incremental update, or default to latest
-                        // absolute update on the first round.
-                        let prev_at = prev_at
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let prev_at = last_stop_time.unwrap_or(prev_at);
-                        (*prev_at, *prev)
-                    })
-                    .unwrap_or_else(|| {
-                        // if we don't have a previous point of comparison, compare to the load time
-                        // lsn.
-                        let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
-                        (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-                    });
-
-                // written_size_delta_bytes
-                current_metrics.extend(
-                    if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                        let up_to = written_size_now
-                            .0
-                            .absolute_time()
-                            .expect("never create EventType::Incremental for written_size");
-                        let key_value =
-                            written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                        Some(key_value)
-                    } else {
-                        None
-                    },
-                );
-
-                // written_size
-                current_metrics.push((key, written_size_now));
-
-                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
-                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
-                    // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push(
-                            PageserverConsumptionMetricsKey::timeline_logical_size(
-                                tenant_id,
-                                timeline.timeline_id,
-                            )
-                            .now(size),
-                        );
-                    }
-                    Ok((_, _)) => {}
-                    Err(err) => {
-                        error!(
-                            "failed to get current logical size for timeline {}: {err:?}",
-                            timeline.timeline_id
-                        );
-                        continue;
-                    }
-                };
+            match TimelineSnapshot::collect(&timeline, ctx) {
+                Ok(Some(snap)) => {
+                    snap.to_metrics(
+                        tenant_id,
+                        timeline_id,
+                        Utc::now(),
+                        &mut current_metrics,
+                        cached_metrics,
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    error!(
+                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
+                        timeline.timeline_id
+                    );
+                    continue;
+                }
            }

-            let timeline_resident_size = timeline.get_resident_physical_size();
-            tenant_resident_size += timeline_resident_size;
+            tenant_resident_size += timeline.resident_physical_size();
        }

-        match tenant.get_remote_size().await {
-            Ok(tenant_remote_size) => {
-                current_metrics.push(
-                    PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
-                        .now(tenant_remote_size),
-                );
-            }
-            Err(err) => {
-                error!(
-                    "failed to get remote size for tenant {}: {err:?}",
-                    tenant_id
-                );
-            }
-        }
+        current_metrics
+            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));

-        current_metrics.push(
-            PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
-        );
+        current_metrics
+            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
+        let synthetic_size = tenant.cached_synthetic_size();

-        if tenant_synthetic_size != 0 {
+        if synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push(
-                PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
-                    .now(tenant_synthetic_size),
-            );
+            current_metrics
+                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
        }
    }

@@ -425,6 +362,8 @@ pub async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

+    let node_id = node_id.to_string();
+
    for chunk in chunks {
        chunk_to_send.clear();

@@ -432,7 +371,7 @@ pub async fn collect_metrics_iteration(
        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
            kind: *when,
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(node_id.to_string()),
+            idempotency_key: idempotency_key(&node_id),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -440,17 +379,14 @@ pub async fn collect_metrics_iteration(
            },
        }));

-        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
-            events: &chunk_to_send,
-        })
-        .expect("PageserverConsumptionMetric should not fail serialization");
-
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&chunk_json)
+                .json(&EventChunk {
+                    events: (&chunk_to_send).into(),
+                })
                .send()
                .await;

@@ -486,6 +422,130 @@ pub async fn collect_metrics_iteration(
    }
 }

+/// Internal type to make timeline metric production testable.
+///
+/// As this value type contains all of the information needed from a timeline to produce the
+/// metrics, it can easily be created with different values in test.
+struct TimelineSnapshot {
+    loaded_at: (Lsn, SystemTime),
+    last_record_lsn: Lsn,
+    current_exact_logical_size: Option<u64>,
+}
+
+impl TimelineSnapshot {
+    /// Collect the metrics from an actual timeline.
+    ///
+    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
+    ///
+    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
+    fn collect(
+        t: &Arc<crate::tenant::Timeline>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Self>> {
+        use anyhow::Context;
+
+        if !t.is_active() {
+            // no collection for broken or stopping needed, we will still keep the cached values
+            // though at the caller.
+            Ok(None)
+        } else {
+            let loaded_at = t.loaded_at;
+            let last_record_lsn = t.get_last_record_lsn();
+
+            let current_exact_logical_size = {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
+                    // Only send timeline logical size when it is fully calculated.
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
+                }
+            };
+
+            Ok(Some(TimelineSnapshot {
+                loaded_at,
+                last_record_lsn,
+                current_exact_logical_size,
+            }))
+        }
+    }
+
+    /// Produce the timeline consumption metrics into the `metrics` argument.
+    fn to_metrics(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        now: DateTime<Utc>,
+        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
+        cache: &HashMap<MetricsKey, (EventType, u64)>,
+    ) {
+        let timeline_written_size = u64::from(self.last_record_lsn);
+
+        let (key, written_size_now) =
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
+
+        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
+        // features might change this.
+
+        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
+
+        // use this when available, because in a stream of incremental values, it will be
+        // accurate where as when last_record_lsn stops moving, we will only cache the last
+        // one of those.
+        let last_stop_time = cache
+            .get(written_size_delta_key.key())
+            .map(|(until, _val)| {
+                until
+                    .incremental_timerange()
+                    .expect("never create EventType::Absolute for written_size_delta")
+                    .end
+            });
+
+        // by default, use the last sent written_size as the basis for
+        // calculating the delta. if we don't yet have one, use the load time value.
+        let prev = cache
+            .get(&key)
+            .map(|(prev_at, prev)| {
+                // use the prev time from our last incremental update, or default to latest
+                // absolute update on the first round.
+                let prev_at = prev_at
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let prev_at = last_stop_time.unwrap_or(prev_at);
+                (*prev_at, *prev)
+            })
+            .unwrap_or_else(|| {
+                // if we don't have a previous point of comparison, compare to the load time
+                // lsn.
+                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
+                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
+            });
+
+        // written_size_bytes_delta
+        metrics.extend(
+            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
+                let up_to = written_size_now
+                    .0
+                    .absolute_time()
+                    .expect("never create EventType::Incremental for written_size");
+                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
+                Some(key_value)
+            } else {
+                None
+            },
+        );
+
+        // written_size
+        metrics.push((key, written_size_now));
+
+        if let Some(size) = self.current_exact_logical_size {
+            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
+        }
+    }
+}
+
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -500,7 +560,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-        tick_at = ticker.tick() => {
+            tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -536,3 +596,149 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+
+    use std::time::SystemTime;
+    use utils::{
+        id::{TenantId, TimelineId},
+        lsn::Lsn,
+    };
+
+    use crate::consumption_metrics::MetricsKey;
+
+    use super::TimelineSnapshot;
+    use chrono::{DateTime, Utc};
+
+    #[test]
+    fn startup_collected_timeline_metrics_before_advancing() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::new();
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, SystemTime::now()),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        let now = DateTime::<Utc>::from(SystemTime::now());
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    snap.loaded_at.1.into(),
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_second_round() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id)
+                    .from_previous_up_to(before, now, 0),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    #[test]
+    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
+        let tenant_id = TenantId::generate();
+        let timeline_id = TimelineId::generate();
+
+        let [now, just_before, before, init] = time_backwards();
+
+        let now = DateTime::<Utc>::from(now);
+        let just_before = DateTime::<Utc>::from(just_before);
+        let before = DateTime::<Utc>::from(before);
+
+        let initdb_lsn = Lsn(0x10000);
+        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+        let mut metrics = Vec::new();
+        let cache = HashMap::from([
+            // at t=before was the last time the last_record_lsn changed
+            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
+            // end time of this event is used for the next ones
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                before,
+                just_before,
+                0,
+            ),
+        ]);
+
+        let snap = TimelineSnapshot {
+            loaded_at: (disk_consistent_lsn, init),
+            last_record_lsn: disk_consistent_lsn,
+            current_exact_logical_size: Some(0x42000),
+        };
+
+        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+        assert_eq!(
+            metrics,
+            &[
+                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
+                    just_before,
+                    now,
+                    0
+                ),
+                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            ]
+        );
+    }
+
+    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
+        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
+        times[0] = std::time::SystemTime::now();
+        for behind in 1..N {
+            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
+        }
+
+        times
+    }
+}
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -644,20 +644,19 @@ impl Tenant {
        Ok(())
    }

-    /// get size of all remote timelines
+    /// Get sum of all remote timelines sizes
    ///
    /// This function relies on the index_part instead of listing the remote storage
-    ///
-    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
+    pub fn remote_size(&self) -> u64 {
        let mut size = 0;

-        for timeline in self.list_timelines().iter() {
+        for timeline in self.list_timelines() {
            if let Some(remote_client) = &timeline.remote_client {
                size += remote_client.get_remote_physical_size();
            }
        }

-        Ok(size)
+        size
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
@@ -2889,7 +2888,7 @@ impl Tenant {
            .set(size);
    }

-    pub fn get_cached_synthetic_size(&self) -> u64 {
+    pub fn cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
 }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,7 +21,7 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
@@ -29,7 +29,7 @@ where
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
    pub fn read_blob_into_buf(
-        &mut self,
+        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -80,7 +80,7 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -420,7 +420,7 @@ mod tests {
            blobs.push((pos, data));
        }

-        let mut cursor = BlockCursor::new(&file);
+        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -9,7 +9,7 @@ mod remote_layer;

 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::repository::{Key, Value};
+use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
@@ -381,12 +381,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

-/// Returned by [`PersistentLayer::iter`]
-pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
-
-/// Returned by [`PersistentLayer::key_iter`]
-pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
-
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
@@ -427,15 +421,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<PathBuf>;

-    /// Iterate through all keys and values stored in the layer
-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
-
-    /// Iterate through all keys stored in the layer. Returns key, lsn and value size
-    /// It is used only for compaction and so is currently implemented only for DeltaLayer
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        panic!("Not implemented")
-    }
-
    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -61,8 +61,8 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, PathOrConf, PersistentLayerDesc,
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
+    PersistentLayerDesc,
 };

 ///
@@ -189,7 +189,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<DeltaLayerInner>,
+    inner: OnceCell<Arc<DeltaLayerInner>>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -258,10 +258,10 @@ impl Layer for DeltaLayer {

        tree_reader.dump().await?;

-        let mut cursor = file.block_cursor();
+        let cursor = file.block_cursor();

        // A subroutine to dump a single blob
-        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
            let buf = cursor.read_blob(blob_ref.pos())?;
            let val = Value::des(&buf)?;
            let desc = match val {
@@ -343,7 +343,7 @@ impl Layer for DeltaLayer {
            })?;

            // Ok, 'offsets' now contains the offsets of all the entries we need to read
-            let mut cursor = file.block_cursor();
+            let cursor = file.block_cursor();
            let mut buf = Vec::new();
            for (entry_lsn, pos) in offsets {
                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
@@ -424,23 +424,6 @@ impl PersistentLayer for DeltaLayer {
        Some(self.path())
    }

-    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .context("load delta layer")?;
-        Ok(match DeltaValueIter::new(inner) {
-            Ok(iter) => Box::new(iter),
-            Err(err) => Box::new(std::iter::once(Err(err))),
-        })
-    }
-
-    fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
-        Ok(Box::new(
-            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
-        ))
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -510,7 +493,11 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
+    fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&Arc<DeltaLayerInner>> {
        self.access_stats
            .record_access(access_kind, ctx.task_kind());
        // Quick exit if already loaded
@@ -519,7 +506,7 @@ impl DeltaLayer {
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<DeltaLayerInner> {
+    fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

        let file = VirtualFile::open(&path)
@@ -554,11 +541,11 @@ impl DeltaLayer {

        debug!("loaded from {}", &path.display());

-        Ok(DeltaLayerInner {
+        Ok(Arc::new(DeltaLayerInner {
            file,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -623,6 +610,24 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
+
+    /// Obtains all keys and value references stored in the layer
+    ///
+    /// The value can be obtained via the [`ValueRef::load`] function.
+    pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer")?;
+        DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
+    }
+
+    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer keys")?;
+        inner.load_keys().context("Layer index is corrupted")
+    }
 }

 /// A builder object for constructing a new delta layer.
@@ -893,121 +898,41 @@ impl Drop for DeltaLayerWriter {
    }
 }

-///
-/// Iterator over all key-value pairse stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaValueIter<'a> {
-    all_offsets: Vec<(DeltaKey, BlobRef)>,
-    next_idx: usize,
-    reader: BlockCursor<Adapter<'a>>,
-}
-
-struct Adapter<'a>(&'a DeltaLayerInner);
-
-impl<'a> BlockReader for Adapter<'a> {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.read_blk(blknum)
-    }
-}
-
-impl<'a> Iterator for DeltaValueIter<'a> {
-    type Item = Result<(Key, Lsn, Value)>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.next_res().transpose()
-    }
-}
-
-impl<'a> DeltaValueIter<'a> {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+impl DeltaLayerInner {
+    fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+        let file = &this.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            this.index_start_blk,
+            this.index_root_blk,
            file,
        );

-        let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
+        let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
            |key, value| {
-                all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
+                let delta_key = DeltaKey::from_slice(key);
+                let val_ref = ValueRef {
+                    blob_ref: BlobRef(value),
+                    reader: BlockCursor::new(Adapter(this.clone())),
+                };
+                all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
                true
            },
        )?;

-        let iter = DeltaValueIter {
-            all_offsets,
-            next_idx: 0,
-            reader: BlockCursor::new(Adapter(inner)),
-        };
-
-        Ok(iter)
+        Ok(all_offsets)
    }
-
-    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
-        if self.next_idx < self.all_offsets.len() {
-            let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            let buf = self.reader.read_blob(blob_ref.pos())?;
-            let val = Value::des(&buf)?;
-            self.next_idx += 1;
-            Ok(Some((key, lsn, val)))
-        } else {
-            Ok(None)
-        }
-    }
-}
-///
-/// Iterator over all keys stored in a delta layer
-///
-/// FIXME: This creates a Vector to hold all keys.
-/// That takes up quite a lot of memory. Should do this in a more streaming
-/// fashion.
-///
-struct DeltaKeyIter {
-    all_keys: Vec<(DeltaKey, u64)>,
-    next_idx: usize,
-}
-
-impl Iterator for DeltaKeyIter {
-    type Item = (Key, Lsn, u64);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.next_idx < self.all_keys.len() {
-            let (delta_key, size) = &self.all_keys[self.next_idx];
-
-            let key = delta_key.key();
-            let lsn = delta_key.lsn();
-
-            self.next_idx += 1;
-            Some((key, lsn, *size))
-        } else {
-            None
-        }
-    }
-}
-
-impl<'a> DeltaKeyIter {
-    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
-        let file = &inner.file;
+    fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
+        let file = &self.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            inner.index_start_blk,
-            inner.index_root_blk,
+            self.index_start_blk,
+            self.index_root_blk,
            file,
        );

-        let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
@@ -1015,46 +940,48 @@ impl<'a> DeltaKeyIter {
                let delta_key = DeltaKey::from_slice(key);
                let pos = BlobRef(value).pos();
                if let Some(last) = all_keys.last_mut() {
-                    if last.0.key() == delta_key.key() {
+                    if last.0 == delta_key.key() {
                        return true;
                    } else {
                        // subtract offset of new key BLOB and first blob of this key
                        // to get total size if values associated with this key
-                        let first_pos = last.1;
-                        last.1 = pos - first_pos;
+                        let first_pos = last.2;
+                        last.2 = pos - first_pos;
                    }
                }
-                all_keys.push((delta_key, pos));
+                all_keys.push((delta_key.key(), delta_key.lsn(), pos));
                true
            },
        )?;
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of layer
-            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
        }
-        let iter = DeltaKeyIter {
-            all_keys,
-            next_idx: 0,
-        };
-
-        Ok(iter)
+        Ok(all_keys)
    }
 }

-#[cfg(test)]
-mod test {
-    use super::DeltaKeyIter;
-    use super::DeltaLayer;
-    use super::DeltaValueIter;
+/// Reference to an on-disk value
+pub struct ValueRef {
+    blob_ref: BlobRef,
+    reader: BlockCursor<Adapter>,
+}

-    // We will soon need the iters to be send in the compaction code.
-    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
-    // Cf https://github.com/neondatabase/neon/issues/4471
-    #[test]
-    fn is_send() {
-        fn assert_send<T: Send>() {}
-        assert_send::<DeltaLayer>();
-        assert_send::<DeltaValueIter>();
-        assert_send::<DeltaKeyIter>();
+impl ValueRef {
+    /// Loads the value from disk
+    pub fn load(&self) -> Result<Value> {
+        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+        let val = Value::des(&buf)?;
+        Ok(val)
+    }
+}
+
+struct Adapter(Arc<DeltaLayerInner>);
+
+impl BlockReader for Adapter {
+    type BlockLease = PageReadGuard<'static>;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+        self.0.file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -57,9 +57,7 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{
-    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
-};
+use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};

 ///
 /// Header stored in the beginning of the file
@@ -255,10 +253,6 @@ impl PersistentLayer for ImageLayer {
        Some(self.path())
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        unimplemented!();
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -151,7 +151,7 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
@@ -196,7 +196,7 @@ impl Layer for InMemoryLayer {

        let inner = self.inner.read().unwrap();

-        let mut reader = inner.file.block_cursor();
+        let reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
@@ -354,7 +354,7 @@ impl InMemoryLayer {

        let mut buf = Vec::new();

-        let mut cursor = inner.file.block_cursor();
+        let cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -20,8 +20,8 @@ use utils::{

 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
-    LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
+    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -129,14 +129,6 @@ impl PersistentLayer for RemoteLayer {
        None
    }

-    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
-    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
-        bail!("cannot iterate a remote layer");
-    }
-
    fn delete_resident_layer_file(&self) -> Result<()> {
        bail!("remote layer has no layer file");
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -528,7 +528,7 @@ impl Timeline {
        size
    }

-    pub fn get_resident_physical_size(&self) -> u64 {
+    pub fn resident_physical_size(&self) -> u64 {
        self.metrics.resident_physical_size_gauge.get()
    }

@@ -3513,7 +3513,13 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;
        for (next_key, _next_lsn, _size) in itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
        )? {
            if let Some(prev_key) = prev {
@@ -3549,25 +3555,31 @@ impl Timeline {
        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
        let all_values_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_val_refs(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
-                    if let Ok((a_key, a_lsn, _)) = a {
-                        if let Ok((b_key, b_lsn, _)) = b {
-                            (a_key, a_lsn) < (b_key, b_lsn)
-                        } else {
-                            false
-                        }
-                    } else {
-                        true
-                    }
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    (a_key, a_lsn) < (b_key, b_lsn)
                })
            },
        )?;

        // This iterator walks through all keys and is needed to calculate size used by each key
        let mut all_keys_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            deltas_to_compact.iter().map(|l| -> Result<_> {
+                Ok(l.clone()
+                    .downcast_delta_layer()
+                    .expect("delta layer")
+                    .load_keys(ctx)?
+                    .into_iter())
+            }),
            |iter_iter| {
                iter_iter.kmerge_by(|a, b| {
                    let (a_key, a_lsn, _) = a;
@@ -3629,8 +3641,8 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for x in all_values_iter {
-            let (key, lsn, value) = x?;
+        for (key, lsn, value_ref) in all_values_iter {
+            let value = value_ref.load()?;
            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
            // We need to check key boundaries once we reach next key or end of layer with the same key
            if !same_key || lsn == dup_end_lsn {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -308,8 +308,13 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let mut state = self.eviction_task_timeline_state.lock().await;
+
+        // Only do the imitate_layer accesses approximately as often as the threshold.  A little
+        // more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
+        let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
+
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
                    .await;
@@ -332,7 +337,7 @@ impl Timeline {
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
                    .await;
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -11,7 +11,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

-///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
 /// so keep it in a named struct.
@@ -19,8 +18,7 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-///
-#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
 pub struct Ids {
    pub endpoint_id: String,
    pub branch_id: String,
@@ -149,7 +147,7 @@ async fn collect_metrics_iteration(
                    stop_time: *curr_time,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.to_owned()),
+                idempotency_key: idempotency_key(hostname),
                value,
                extra: Ids {
                    endpoint_id: curr_key.endpoint_id.clone(),
@@ -167,12 +165,11 @@ async fn collect_metrics_iteration(
    // Send metrics.
    // Split into chunks of 1000 metrics to avoid exceeding the max request size
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
-        let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
-            .expect("ProxyConsumptionMetric should not fail serialization");
-
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&chunk_json)
+            .json(&EventChunk {
+                events: chunk.into(),
+            })
            .send()
            .await;

--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -197,10 +197,9 @@ def wait_timeline_detail_404(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,
+    iterations: int,
 ):
    last_exc = None
-    iterations = 10 if wait_longer else 2
    for _ in range(iterations):
        time.sleep(0.250)
        try:
@@ -220,8 +219,8 @@ def timeline_delete_wait_completed(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    timeline_id: TimelineId,
-    wait_longer: bool = False,  # Use when running with RemoteStorageKind.REAL_S3
+    iterations: int = 20,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -229,6 +229,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

    ps_http.configure_failpoints((failpoint, "return"))

+    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
+
    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
    if failpoint in (
@@ -245,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            tenant_id=env.initial_tenant,
            timeline_id=timeline_id,
            expected_state="Broken",
-            iterations=2,  # effectively try immediately and retry once in one second
+            iterations=iterations,
        )

        reason = timeline_info["state"]["Broken"]["reason"]
@@ -254,21 +256,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        # failpoint may not be the only error in the stack
        assert reason.endswith(f"failpoint: {failpoint}"), reason

-    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
    if check is Check.RETRY_WITH_RESTART:
        env.pageserver.stop()
        env.pageserver.start()
+
+        wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations)
+
        if failpoint == "timeline-delete-before-index-deleted-at":
            # We crashed before persisting this to remote storage, need to retry delete request
-
-            # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
-            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
-
            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
        else:
            # Pageserver should've resumed deletion after restart.
            wait_timeline_detail_404(
-                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+                ps_http, env.initial_tenant, timeline_id, iterations=iterations
            )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
@@ -276,7 +276,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        ps_http.configure_failpoints((failpoint, "off"))

        timeline_delete_wait_completed(
-            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

    # Check remote is impty
@@ -569,7 +569,7 @@ def test_concurrent_timeline_delete_stuck_on(
        try:
            log.info("first call start")
            timeline_delete_wait_completed(
-                ps_http, env.initial_tenant, child_timeline_id, timeout=10
+                ps_http, env.initial_tenant, child_timeline_id, timeout=20
            )
            log.info("first call success")
            result_queue.put("success")
@@ -683,7 +683,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
    wait_until(50, 0.1, first_request_finished)

    # check that the timeline is gone
-    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id)
+    wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2)


@pytest.mark.parametrize(
Author	SHA1	Message	Date
Bojan Serafimov	e88e565f46	Return metrics from configure request	2023-08-03 11:30:52 -04:00
Joonas Koivunen	04776ade6c	fix(consumption): rename _size_ => _data_ (#4866 ) I failed at renaming the metric middle part while managing to do a great job with the suffix. Fix the middle part as well.	2023-08-01 19:18:25 +03:00
Dmitry Rodionov	c3fe335eaf	wait for tenant to be active before polling for timeline absence (#4856 ) ## Problem https://neon-github-public-dev.s3.amazonaws.com/reports/main/5692829577/index.html#suites/f588e0a787c49e67b29490359c589fae/4c50937643d68a66 ## Summary of changes wait for tenant to be active after restart before polling for timeline absence	2023-08-01 18:28:18 +03:00
Joonas Koivunen	3a00a5deb2	refactor: tidy consumption metrics (#4860 ) Tidying up I've been wanting to do for some time. Follow-up to #4857.	2023-08-01 18:14:16 +03:00
Joonas Koivunen	78fa2b13e5	test: written_size_bytes_delta (#4857 ) Two stabs at this, by mocking a http receiver and the globals out (now reverted) and then by separating the timeline dependency and just testing what kind of events certain timelines produce. I think this pattern could work for some of our problems. Follow-up to #4822.	2023-08-01 15:30:36 +03:00
John Spray	7c076edeea	pageserver: tweak period of imitate_layer_accesses (#4859 ) ## Problem When the eviction threshold is an integer multiple of the eviction period, it is unreliable to skip imitating accesses based on whether the last imitation was more recent than the threshold. This is because as finite time passes between the time used for the periodic execution, and the 'now' time used for updating last_layer_access_imitation. When this is just a few milliseconds, and everything else is on-time, then a 5 second threshold with a 1 second period will end up entering its 5th iteration slightly _less than_ 5 second since last_layer_access_imitation, and thereby skipping instead of running the imitation. If a few milliseconds then pass before we check the access time of a file that _should_ have been bumped by the imitation pass, then we end up evicting something we shouldn't have evicted. ## Summary of changes We can make this race far less likely by using the threshold minus one interval as the period for re-executing the imitate_layer_accesses: that way we're not vulnerable to racing by just a few millis, and there would have to be a delay of the order `period` to cause us to wrongly evict a layer. This is not a complete solution: it would be good to revisit this and use a non-walltime mechanism for pinning these layers into local storage, rather than relying on bumping access times.	2023-08-01 13:17:49 +01:00
Arpad Müller	69528b7c30	Prepare k-merge in compaction for async I/O (#4836 ) ## Problem The k-merge in pageserver compaction currently relies on iterators over the keys and also over the values. This approach does not support async code because we are using iterators and those don't support async in general. Also, the k-merge implementation we use doesn't support async either. Instead, as we already load all the keys into memory, the plan is to just do the sorting in-memory for now, switch to async, and then once we want to support workloads that don't have all keys stored in memory, we can look into switching to a k-merge implementation that supports async instead. ## Summary of changes The core of this PR is the move from functions on the `PersistentLayer` trait to return custom iterator types to inherent functions on `DeltaLayer` that return buffers with all keys or value references. Value references are a type we created in this PR, containing a `BlobRef` as well as an `Arc` pointer to the `DeltaLayerInner`, so that we can lazily load the values during compaction. This preserves the property of the current code. This PR does not switch us to doing the k-merge via sort on slices, but with this PR, doing such a switch is relatively easy and only requires changes of the compaction code itself. Part of https://github.com/neondatabase/neon/issues/4743	2023-08-01 13:38:35 +02:00