Add timeline_logical_size metric.

Send this metric only when it is fully calculated.

Make consumption metrics more stable:
- Send per-timeline metrics only for active timelines.
- Adjust assertions to make test_metric_collection more stable.
Author: Anastasia Lubennikova
Date:   2022-12-27 19:45:09 +02:00
Commit: 8ff7bc5df1
Parent: 890ff3803e

5 changed files with 55 additions and 21 deletions
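
In short, per-timeline metrics are collected only while a timeline is active, and the new timeline_logical_size value is sent only once the initial size calculation has finished, i.e. when get_current_logical_size reports the size as exact rather than approximate. A minimal standalone sketch of that gating, using hypothetical stand-in types rather than the real pageserver structs:

    // Standalone sketch of the gating rules described in the commit message;
    // the types below are simplified stand-ins, not the actual pageserver code.
    #[derive(Debug)]
    enum Metric {
        WrittenSize(u64),
        TimelineLogicalSize(u64),
    }

    struct FakeTimeline {
        active: bool,
        written_size: u64,
        logical_size: u64,
        // false while the initial logical-size calculation is still running
        logical_size_is_exact: bool,
    }

    fn collect(timeline: &FakeTimeline) -> Vec<Metric> {
        let mut metrics = Vec::new();
        // per-timeline metrics are collected only for active timelines
        if timeline.active {
            metrics.push(Metric::WrittenSize(timeline.written_size));
            // the logical size is sent only once it is fully calculated
            if timeline.logical_size_is_exact {
                metrics.push(Metric::TimelineLogicalSize(timeline.logical_size));
            }
        }
        metrics
    }

    fn main() {
        let tl = FakeTimeline {
            active: true,
            written_size: 4096,
            logical_size: 1024,
            logical_size_is_exact: false,
        };
        // prints only WrittenSize(4096), because the logical size is still approximate
        println!("{:?}", collect(&tl));
    }

With logical_size_is_exact set to false, only the written-size metric is produced; flipping it to true adds the logical-size metric, which mirrors the behaviour the diffs below implement.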


@@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
+    /// Logical size of the data in the timeline
+    /// This is an absolute, per-timeline metric
+    TimelineLogicalSize,
 }
 
 impl FromStr for ConsumptionMetricKind {
@@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind {
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
             "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
     }
@@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind {
             ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
             ConsumptionMetricKind::ResidentSize => "resident_size",
             ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
         })
     }
 }
@@ -191,23 +196,35 @@ pub async fn collect_metrics_task(
         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
-            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());
 
-            current_metrics.push((
-                ConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: Some(timeline.timeline_id),
-                    metric: ConsumptionMetricKind::WrittenSize,
-                },
-                timeline_written_size,
-            ));
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }
 
             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
 
             debug!(
                 "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
                 tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }
 
         let tenant_remote_size = tenant.get_remote_size().await?;


@@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None

@@ -752,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
    /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");
 
+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }
 
-        Ok(size)
+        Ok((size, is_exact))
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in


@@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection(
         // Send the replication feedback message.
         // Regular standby_status_update fields are put into this message.
+        let (timeline_logical_size, _) = timeline
+            .get_current_logical_size()
+            .context("Status update creation failed to get current logical size")?;
         let status_update = ReplicationFeedback {
-            current_timeline_size: timeline
-                .get_current_logical_size()
-                .context("Status update creation failed to get current logical size")?,
+            current_timeline_size: timeline_logical_size,
             ps_writelsn: write_lsn,
             ps_flushlsn: flush_lsn,
             ps_applylsn: apply_lsn,


@@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response:
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+        # logical size may lag behind the actual size, so allow 0 here
+        "timeline_logical_size": lambda value: value >= 0,
     }
 
+    events_received = 0
     for event in events:
-        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+        check = checks.get(event["metric"])
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{event['metric']} isn't valid"
+        events_received += 1
 
     global first_request
     # check that all checks were sent
     # but only on the first request, because we don't send non-changed metrics
     if first_request:
-        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        # we may receive more metrics than we check,
+        # because there are two timelines
+        # and we may receive per-timeline metrics from both
+        # if the test was slow enough for these metrics to be collected
+        # -1 because that is ok to not receive timeline_logical_size
+        assert events_received >= len(checks) - 1
         first_request = False
 
    global num_metrics_received