diff --git a/pageserver/src/layered_repository/timeline.rs b/pageserver/src/layered_repository/timeline.rs
index 8f3004af98..7bbde53dbd 100644
--- a/pageserver/src/layered_repository/timeline.rs
+++ b/pageserver/src/layered_repository/timeline.rs
@@ -139,6 +139,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_current_logical_size",
+        "Current logical size grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
 // or in testing they estimate how much we would upload if we did.
 static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -234,6 +243,8 @@ struct TimelineMetrics {
     pub last_record_gauge: IntGauge,
     pub wait_lsn_time_histo: Histogram,
     pub current_physical_size_gauge: UIntGauge,
+    /// copy of Timeline::current_logical_size
+    pub current_logical_size_gauge: IntGauge,
 }
 
 impl TimelineMetrics {
@@ -271,6 +282,9 @@ impl TimelineMetrics {
         let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
+        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
 
         TimelineMetrics {
             reconstruct_time_histo,
@@ -283,6 +297,7 @@ impl TimelineMetrics {
             last_record_gauge,
             wait_lsn_time_histo,
             current_physical_size_gauge,
+            current_logical_size_gauge,
         }
     }
 }
@@ -391,6 +406,11 @@ pub struct Timeline {
     /// get_current_logical_size() will clamp the returned value to zero if it's
     /// negative, and log an error. Could set it permanently to zero or some
     /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a prometheus metric,
+    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
+    /// and `set_current_logical_size` functions to modify this; they will
+    /// also keep the prometheus metric in sync.
     current_logical_size: AtomicI64,
 
     /// Information about the last processed message by the WAL receiver,
@@ -827,8 +847,7 @@ impl Timeline {
         //
         // Logical size 0 means that it was not initialized, so don't believe that.
         if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
-            self.current_logical_size
-                .store(ancestor_logical_size as i64, AtomicOrdering::SeqCst);
+            self.set_current_logical_size(ancestor_logical_size);
             debug!(
                 "logical size copied from ancestor: {}",
                 ancestor_logical_size
@@ -842,8 +861,7 @@ impl Timeline {
         // Have to calculate it the hard way
         let last_lsn = self.get_last_record_lsn();
         let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
-        self.current_logical_size
-            .store(logical_size as i64, AtomicOrdering::SeqCst);
+        self.set_current_logical_size(logical_size);
         debug!("calculated logical size the hard way: {}", logical_size);
 
         timer.stop_and_record();
@@ -867,6 +885,34 @@ impl Timeline {
         }
     }
 
+    /// Update current logical size, adding `delta` to the old value.
+    fn update_current_logical_size(&self, delta: i64) {
+        let new_size = self
+            .current_logical_size
+            .fetch_add(delta, AtomicOrdering::SeqCst) + delta;
+
+        // Also set the value in the prometheus gauge. Note that
+        // there is a race condition here: if this is called by two
+        // threads concurrently, the prometheus gauge might be set to
+        // one value while current_logical_size is set to the
+        // other. Currently, only initialization and the WAL receiver
+        // update the logical size, and they don't run concurrently,
+        // so this cannot happen. And even if it did, it wouldn't be
+        // very serious: the metrics would just be slightly off until
+        // the next update.
+        self.metrics.current_logical_size_gauge.set(new_size);
+    }
+
+    /// Set current logical size.
+    fn set_current_logical_size(&self, new_size: u64) {
+        self.current_logical_size
+            .store(new_size as i64, AtomicOrdering::SeqCst);
+
+        // Also set the value in the prometheus gauge. Same race condition
+        // here as in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(new_size as i64);
+    }
+
     ///
     /// Get a handle to a Layer for reading.
     ///
@@ -2261,9 +2307,7 @@ impl<'a> TimelineWriter<'a> {
     }
 
     pub fn update_current_logical_size(&self, delta: i64) {
-        self.tl
-            .current_logical_size
-            .fetch_add(delta, AtomicOrdering::SeqCst);
+        self.tl.update_current_logical_size(delta)
     }
 }
 
diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py
index 6e1168e38f..4a9359cf43 100644
--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -1,4 +1,5 @@
 from contextlib import closing
+import math
 import random
 from uuid import UUID
 import re
@@ -278,11 +279,13 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
-def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
+# The timeline logical and physical sizes are also exposed as prometheus metrics.
+# Test the metrics.
+def test_timeline_size_metrics(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
-    new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric')
-    pg = env.postgres.create_start("test_timeline_physical_size_metric")
+    new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics')
+    pg = env.postgres.create_start("test_timeline_size_metrics")
 
     pg.safe_psql_many([
         "CREATE TABLE foo (t text)",
@@ -301,12 +304,32 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
                            metrics,
                            re.MULTILINE)
     assert matches
-
-    # assert that the metric matches the actual physical size on disk
     tl_physical_size_metric = int(matches.group(1))
+
+    # assert that the physical size metric matches the actual physical size on disk
     timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
     assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
 
+    # Check that the logical size metric is sane, and matches the size reported by Postgres
+    matches = re.search(
+        f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$',
+        metrics,
+        re.MULTILINE)
+    assert matches
+    tl_logical_size_metric = int(matches.group(1))
+
+    # An empty database is around 8 MB. There are at least 3 databases: 'postgres',
+    # 'template0', 'template1'. So the total size should be about 32 MB. This isn't
+    # very accurate and can change with different PostgreSQL versions, so allow a
+    # couple of MB of slack.
+    assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024)
+
+    # The sum of the sizes of all databases, as seen by pg_database_size(), should also
+    # be close. Again allow some slack: the logical size metric includes some things,
+    # like the SLRUs, that are not included in pg_database_size().
+    dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
+    assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
+
 
 def test_tenant_physical_size(neon_simple_env: NeonEnv):
     random.seed(100)
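
A standalone sketch, separate from the patch above: it illustrates the delta-update pattern that `update_current_logical_size` uses. `AtomicI64::fetch_add` returns the value held before the addition, so the new size is the returned value plus `delta`, and a second updater could interleave between the atomic update and the gauge write, which is the race the comment in the patch describes. `SizeTracker` and its `gauge` field are hypothetical stand-ins; the prometheus IntGauge is modeled here with a plain AtomicI64 so the example compiles with the standard library only.

use std::sync::atomic::{AtomicI64, Ordering};

struct SizeTracker {
    current_logical_size: AtomicI64, // authoritative logical size
    gauge: AtomicI64,                // stand-in for the prometheus IntGauge
}

impl SizeTracker {
    fn update_current_logical_size(&self, delta: i64) {
        // fetch_add returns the previous value, so add `delta` to get the new size.
        let new_size = self.current_logical_size.fetch_add(delta, Ordering::SeqCst) + delta;
        // Another updater could run between these two statements, so the gauge can
        // briefly lag the atomic; with a single writer (the WAL receiver) that is benign.
        self.gauge.store(new_size, Ordering::SeqCst);
    }
}

fn main() {
    let t = SizeTracker {
        current_logical_size: AtomicI64::new(0),
        gauge: AtomicI64::new(0),
    };
    t.update_current_logical_size(8192);
    t.update_current_logical_size(-1024);
    assert_eq!(t.current_logical_size.load(Ordering::SeqCst), 7168);
    assert_eq!(t.gauge.load(Ordering::SeqCst), 7168);
    println!("logical size: {}", t.gauge.load(Ordering::SeqCst));
}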