Expose timeline logical size as a prometheus metric.

Physical size was already exposed, and it'd be nice to show both
logical and physical size side by side in our Grafana dashboards.
This commit is contained in:
Heikki Linnakangas
2022-08-19 22:21:33 +03:00
parent 84cd40b416
commit d48177d0d8
2 changed files with 79 additions and 12 deletions

View File

@@ -139,6 +139,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
/// Prometheus gauge family `pageserver_current_logical_size`, with one
/// gauge per (`tenant_id`, `timeline_id`) label pair.
///
/// Registration happens lazily on first access; a registration failure
/// panics with "failed to define a metric".
static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    let registration = register_int_gauge_vec!(
        "pageserver_current_logical_size",
        "Current logical size grouped by timeline",
        &["tenant_id", "timeline_id"]
    );
    registration.expect("failed to define a metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -234,6 +243,8 @@ struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub current_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: IntGauge,
}
impl TimelineMetrics {
@@ -271,6 +282,9 @@ impl TimelineMetrics {
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
reconstruct_time_histo,
@@ -283,6 +297,7 @@ impl TimelineMetrics {
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
current_logical_size_gauge,
}
}
}
@@ -391,6 +406,11 @@ pub struct Timeline {
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
///
/// Note that we also expose a copy of this value as a prometheus metric,
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
/// and `set_current_logical_size` functions to modify this, they will
/// also keep the prometheus metric in sync.
current_logical_size: AtomicI64,
/// Information about the last processed message by the WAL receiver,
@@ -827,8 +847,7 @@ impl Timeline {
//
// Logical size 0 means that it was not initialized, so don't believe that.
if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
self.current_logical_size
.store(ancestor_logical_size as i64, AtomicOrdering::SeqCst);
self.set_current_logical_size(ancestor_logical_size);
debug!(
"logical size copied from ancestor: {}",
ancestor_logical_size
@@ -842,8 +861,7 @@ impl Timeline {
// Have to calculate it the hard way
let last_lsn = self.get_last_record_lsn();
let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
self.current_logical_size
.store(logical_size as i64, AtomicOrdering::SeqCst);
self.set_current_logical_size(logical_size);
debug!("calculated logical size the hard way: {}", logical_size);
timer.stop_and_record();
@@ -867,6 +885,34 @@ impl Timeline {
}
}
/// Update current logical size, adding `delta` to the old value.
///
/// `delta` may be negative. The prometheus gauge
/// `current_logical_size_gauge` is set to the resulting value so that the
/// metric mirrors `current_logical_size`.
fn update_current_logical_size(&self, delta: i64) {
    // `fetch_add` returns the value that was stored *before* the addition,
    // not the updated one.
    let old_size = self
        .current_logical_size
        .fetch_add(delta, AtomicOrdering::SeqCst);
    // Recompute the value that is now stored. Atomic addition wraps around
    // on overflow, so use a wrapping add to stay bit-for-bit in agreement
    // with the atomic.
    let new_size = old_size.wrapping_add(delta);

    // Also set the value in the prometheus gauge. Note that
    // there is a race condition here: if this is called by two
    // threads concurrently, the prometheus gauge might be set to
    // one value while current_logical_size is set to the
    // other. Currently, only initialization and the WAL receiver
    // update the logical size, and they don't run concurrently,
    // so it cannot happen. And even if it did, it wouldn't be
    // very serious, the metrics would just be slightly off until
    // the next update.
    self.metrics.current_logical_size_gauge.set(new_size);
}
/// Overwrite the current logical size with `new_size`.
///
/// The value is stored in `current_logical_size` and then mirrored into
/// the prometheus gauge `current_logical_size_gauge`.
fn set_current_logical_size(&self, new_size: u64) {
    // Both the atomic and the gauge hold signed 64-bit values, so convert once.
    let size = new_size as i64;
    self.current_logical_size
        .store(size, AtomicOrdering::SeqCst);

    // The store and the gauge update are two separate steps, so this is
    // subject to the same race condition as `update_current_logical_size`.
    self.metrics.current_logical_size_gauge.set(size);
}
///
/// Get a handle to a Layer for reading.
///
@@ -2261,9 +2307,7 @@ impl<'a> TimelineWriter<'a> {
}
pub fn update_current_logical_size(&self, delta: i64) {
self.tl
.current_logical_size
.fetch_add(delta, AtomicOrdering::SeqCst);
self.tl.update_current_logical_size(delta)
}
}

View File

@@ -1,4 +1,5 @@
from contextlib import closing
import math
import random
from uuid import UUID
import re
@@ -278,11 +279,13 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
assert_physical_size(env, env.initial_tenant, new_timeline_id)
def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
# The timeline logical and physical sizes are also exposed as prometheus metrics.
# Test the metrics.
def test_timeline_size_metrics(neon_simple_env: NeonEnv):
env = neon_simple_env
new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric')
pg = env.postgres.create_start("test_timeline_physical_size_metric")
new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics')
pg = env.postgres.create_start("test_timeline_size_metrics")
pg.safe_psql_many([
"CREATE TABLE foo (t text)",
@@ -301,12 +304,32 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
metrics,
re.MULTILINE)
assert matches
# assert that the metric matches the actual physical size on disk
tl_physical_size_metric = int(matches.group(1))
# assert that the physical size metric matches the actual physical size on disk
timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
# Check that the logical size metric is sane, and roughly matches the database sizes reported by PostgreSQL
matches = re.search(
f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$',
metrics,
re.MULTILINE)
assert matches
tl_logical_size_metric = int(matches.group(1))
# An empty database is around 8 MB. There are at least 3 databases, 'postgres',
# 'template0', 'template1'. So the total size should be about 32 MB. This isn't
# very accurate and can change with different PostgreSQL versions, so allow a
# couple of MB of slack.
assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024)
# The sum of the sizes of all databases, as seen by pg_database_size(), should also
# be close. Again allow some slack, the logical size metric includes some things like
# the SLRUs that are not included in pg_database_size().
dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
def test_tenant_physical_size(neon_simple_env: NeonEnv):
random.seed(100)