Expose timeline logical size as a prometheus metric.

Physical size was already exposed, and it'd be nice to show both logical and physical size side by side in our Grafana dashboards.
2026-01-08 22:12:56 +00:00 · 2022-08-19 22:21:33 +03:00
parent 84cd40b416
commit d48177d0d8
2 changed files with 79 additions and 12 deletions
--- a/pageserver/src/layered_repository/timeline.rs
+++ b/pageserver/src/layered_repository/timeline.rs
@@ -139,6 +139,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_current_logical_size",
+        "Current logical size grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
 // or in testing they estimate how much we would upload if we did.
 static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -234,6 +243,8 @@ struct TimelineMetrics {
    pub last_record_gauge: IntGauge,
    pub wait_lsn_time_histo: Histogram,
    pub current_physical_size_gauge: UIntGauge,
+    /// copy of LayeredTimeline.current_logical_size
+    pub current_logical_size_gauge: IntGauge,
 }

 impl TimelineMetrics {
@@ -271,6 +282,9 @@ impl TimelineMetrics {
        let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();

        TimelineMetrics {
            reconstruct_time_histo,
@@ -283,6 +297,7 @@ impl TimelineMetrics {
            last_record_gauge,
            wait_lsn_time_histo,
            current_physical_size_gauge,
+            current_logical_size_gauge,
        }
    }
 }
@@ -391,6 +406,11 @@ pub struct Timeline {
    /// get_current_logical_size() will clamp the returned value to zero if it's
    /// negative, and log an error. Could set it permanently to zero or some
    /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a prometheus metric,
+    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
+    /// and `set_current_logical_size` functions to modify this, they will
+    /// also keep the prometheus metric in sync.
    current_logical_size: AtomicI64,

    /// Information about the last processed message by the WAL receiver,
@@ -827,8 +847,7 @@ impl Timeline {
            //
            // Logical size 0 means that it was not initialized, so don't believe that.
            if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
-                self.current_logical_size
-                    .store(ancestor_logical_size as i64, AtomicOrdering::SeqCst);
+                self.set_current_logical_size(ancestor_logical_size);
                debug!(
                    "logical size copied from ancestor: {}",
                    ancestor_logical_size
@@ -842,8 +861,7 @@ impl Timeline {
        // Have to calculate it the hard way
        let last_lsn = self.get_last_record_lsn();
        let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
-        self.current_logical_size
-            .store(logical_size as i64, AtomicOrdering::SeqCst);
+        self.set_current_logical_size(logical_size);
        debug!("calculated logical size the hard way: {}", logical_size);

        timer.stop_and_record();
@@ -867,6 +885,34 @@ impl Timeline {
        }
    }

+    /// Update current logical size, adding `delta` to the old value.
+    fn update_current_logical_size(&self, delta: i64) {
+        // `fetch_add` returns the value *before* the addition, so add `delta`
+        // again (wrapping, like the atomic itself) to get the new size.
+        let old_size = self
+            .current_logical_size
+            .fetch_add(delta, AtomicOrdering::SeqCst);
+        let new_size = old_size.wrapping_add(delta);
+
+        // Also set the value in the prometheus gauge. Note that there is a
+        // race condition here: if this is called by two threads concurrently,
+        // the prometheus gauge might be set to one value while
+        // current_logical_size is set to the other. Currently, only
+        // initialization and the WAL receiver update the logical size, and
+        // they don't run concurrently, so it cannot happen. And even if it
+        // did, it wouldn't be very serious, the metrics would just be
+        // slightly off until the next update.
+        self.metrics.current_logical_size_gauge.set(new_size);
+    }
+
+    /// Set current logical size to `new_size`, overwriting the old value.
+    fn set_current_logical_size(&self, new_size: u64) {
+        self.current_logical_size
+            .store(new_size as i64, AtomicOrdering::SeqCst);
+
+        // Also mirror the value into the prometheus gauge (`as i64` would wrap
+        // above i64::MAX). Same race as in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(new_size as i64);
+    }
+
    ///
    /// Get a handle to a Layer for reading.
    ///
@@ -2261,9 +2307,7 @@ impl<'a> TimelineWriter<'a> {
    }

    pub fn update_current_logical_size(&self, delta: i64) {
-        self.tl
-            .current_logical_size
-            .fetch_add(delta, AtomicOrdering::SeqCst);
+        self.tl.update_current_logical_size(delta)
    }
 }

--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -1,4 +1,5 @@
 from contextlib import closing
+import math
 import random
 from uuid import UUID
 import re
@@ -278,11 +279,13 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


-def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
+# The timeline logical and physical sizes are also exposed as prometheus metrics.
+# Test the metrics.
+def test_timeline_size_metrics(neon_simple_env: NeonEnv):
    env = neon_simple_env

-    new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric')
-    pg = env.postgres.create_start("test_timeline_physical_size_metric")
+    new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics')
+    pg = env.postgres.create_start("test_timeline_size_metrics")

    pg.safe_psql_many([
        "CREATE TABLE foo (t text)",
@@ -301,12 +304,32 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
        metrics,
        re.MULTILINE)
    assert matches
-
-    # assert that the metric matches the actual physical size on disk
    tl_physical_size_metric = int(matches.group(1))
+
+    # assert that the physical size metric matches the actual physical size on disk
    timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
    assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)

+    # Check that the logical size metric is sane, and matches what PostgreSQL reports.
+    matches = re.search(
+        f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$',
+        metrics,
+        re.MULTILINE)
+    assert matches
+    tl_logical_size_metric = int(matches.group(1))
+
+    # An empty database is around 8 MB. There are at least 3 databases, 'postgres',
+    # 'template0', 'template1'. So the total size should be about 32 MB. This isn't
+    # very accurate and can change with different PostgreSQL versions, so allow a
+    # couple of MB of slack.
+    assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024)
+
+    # The sum of the sizes of all databases, as seen by pg_database_size(), should also
+    # be close. Again allow some slack, the logical size metric includes some things like
+    # the SLRUs that are not included in pg_database_size().
+    dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
+    assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
+

 def test_tenant_physical_size(neon_simple_env: NeonEnv):
    random.seed(100)