consumption_metrics: fix periodicness behavior & reporting (#5625)

Before this PR, the ticker was running with the default miss behavior, `Burst`.
For example, here is the startup output with 25k tenants:

```
2023-10-19T09:57:21.682466Z  INFO synthetic_size_worker: starting calculate_synthetic_size_worker
2023-10-19T10:50:44.678202Z  WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=3202.995707156s period=10m task=ConsumptionMetricsSyntheticSizeWorker
2023-10-19T10:52:17.408056Z  WARN synthetic_size_worker: task iteration took longer than the configured period elapsed=2695.72556035s period=10m task=ConsumptionMetricsSyntheticSizeWorker
```

The first message's `elapsed` value is correct: it matches the delta between
the two log line timestamps.

The second one, however, is logged only ca. 1.5 min later, yet it reports an
`elapsed` of roughly 45 min. That is the bug: with `Burst`, a missed tick
completes immediately, but the `Instant` returned by `Interval::tick` is the
tick's originally *scheduled* time, so `tick_at.elapsed()` is measured from
that stale deadline rather than from the start of the iteration.
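
To make the mechanism concrete, here is a minimal, self-contained sketch
(shortened period, illustrative values only):

```rust
use std::time::Duration;

#[tokio::main]
async fn main() {
    // The real worker uses a 10-minute period; shortened here.
    let period = Duration::from_secs(2);
    let mut ticker = tokio::time::interval(period);

    // The first tick of a tokio interval completes immediately.
    let tick_at = ticker.tick().await;
    println!("tick 1: elapsed = {:?}", tick_at.elapsed()); // ~0s

    // Simulate an iteration that overruns the period several times over.
    tokio::time::sleep(Duration::from_secs(7)).await;

    // With the default Burst behavior, this missed tick completes
    // immediately, but the returned Instant is the tick's *scheduled*
    // time (one period after the first tick), so elapsed() reports ~5s
    // even though the next iteration is only just starting.
    let tick_at = ticker.tick().await;
    println!("tick 2: elapsed = {:?}", tick_at.elapsed()); // ~5s
}
```

That is exactly the shape of the second log line: logged ~1.5 min after the
previous one, but reporting an `elapsed` measured from a tick scheduled
~45 min earlier.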

This PR fixes the behavior by copying what `eviction_task.rs` does: each
iteration records its own `started_at`, reports `started_at.elapsed()` to the
overrun warning, and then waits until `started_at + period` (or shutdown)
before starting the next iteration.
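
In outline, that pattern looks like this (a minimal sketch, assuming a
`tokio_util::sync::CancellationToken` for shutdown; `periodic_task` and
`do_iteration` are illustrative names, not the actual code):

```rust
use std::time::Duration;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;

async fn periodic_task(period: Duration, cancel: CancellationToken) {
    loop {
        // Measure each iteration from its own start, not from a ticker's
        // scheduled tick time, so overrun warnings report real durations.
        let started_at = Instant::now();

        do_iteration().await;

        if started_at.elapsed() > period {
            // warn_when_period_overrun(...) goes here in the real code
        }

        // Sleep out the remainder of the period, racing against shutdown:
        // if cancellation wins the race, timeout_at returns Ok and we exit.
        if tokio::time::timeout_at(started_at + period, cancel.cancelled())
            .await
            .is_ok()
        {
            return;
        }
    }
}

async fn do_iteration() {
    // placeholder for the per-iteration work
}
```

With this scheme the period is start-to-start: an overrunning iteration simply
starts the next one immediately, and the reported `elapsed` always reflects the
iteration's actual duration.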

Author: Christian Schwarz
Committed: 2023-10-23 16:31:38 +02:00 (via GitHub)
Commit: c6ca1d76d2
Parent: 94b4e76e13

```diff
@@ -11,6 +11,7 @@ use reqwest::Url;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
+use tokio::time::Instant;
 use tracing::*;
 use utils::id::NodeId;
@@ -88,22 +89,12 @@ pub async fn collect_metrics(
     let node_id = node_id.to_string();
 
-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
     loop {
-        let tick_at = tokio::select! {
-            _ = cancel.cancelled() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();
 
         // these are point in time, with variable "now"
         let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
 
         if metrics.is_empty() {
             continue;
         }
 
         let metrics = Arc::new(metrics);
 
         // why not race cancellation here? because we are one of the last tasks, and if we are
@@ -142,10 +133,19 @@ pub async fn collect_metrics(
         let (_, _) = tokio::join!(flush, upload);
 
         crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
             metric_collection_interval,
             BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
         );
+
+        let res = tokio::time::timeout_at(
+            started_at + metric_collection_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
     }
 }
@@ -244,16 +244,14 @@ async fn calculate_synthetic_size_worker(
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
     info!("starting calculate_synthetic_size_worker");
+    scopeguard::defer! {
+        info!("calculate_synthetic_size_worker stopped");
+    };
 
-    // reminder: ticker is ready immediatedly
-    let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
-
     let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
     loop {
-        let tick_at = tokio::select! {
-            _ = task_mgr::shutdown_watcher() => return Ok(()),
-            tick_at = ticker.tick() => tick_at,
-        };
+        let started_at = Instant::now();
 
         let tenants = match mgr::list_tenants().await {
             Ok(tenants) => tenants,
@@ -281,9 +279,18 @@ async fn calculate_synthetic_size_worker(
         }
 
         crate::tenant::tasks::warn_when_period_overrun(
-            tick_at.elapsed(),
+            started_at.elapsed(),
             synthetic_size_calculation_interval,
             BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
         );
+
+        let res = tokio::time::timeout_at(
+            started_at + synthetic_size_calculation_interval,
+            task_mgr::shutdown_token().cancelled(),
+        )
+        .await;
+        if res.is_ok() {
+            return Ok(());
+        }
     }
 }
```