Expose timeline logical size as a prometheus metric.
Physical size was already exposed, and it'd be nice to show both logical and physical size side by side in our Grafana dashboards.
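For readers unfamiliar with the pageserver's metrics plumbing, the diff below follows the standard prometheus-crate pattern: register a labelled gauge vector once, look up the per-timeline gauge by its (tenant_id, timeline_id) label values, and set it whenever the size changes; the default registry is then rendered in the Prometheus text format for scraping, which is what Grafana ends up graphing. A minimal standalone sketch of that flow (the dependency versions and label values are illustrative assumptions, not part of this commit):

// Assumed dependencies: prometheus = "0.13", once_cell = "1"
use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, Encoder, IntGaugeVec, TextEncoder};

// Same registration shape as CURRENT_LOGICAL_SIZE in the diff below:
// one gauge per (tenant_id, timeline_id) label pair.
static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_current_logical_size",
        "Current logical size grouped by timeline",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn main() {
    // Illustrative label values; the real ones are tenant and timeline ids.
    CURRENT_LOGICAL_SIZE
        .with_label_values(&["tenant-a", "timeline-1"])
        .set(32 * 1024 * 1024);

    // Render the default registry in the Prometheus text exposition format,
    // i.e. what a metrics scrape (and hence Grafana) would see.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&prometheus::gather(), &mut buf)
        .expect("failed to encode metrics");
    print!("{}", String::from_utf8(buf).unwrap());
}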
@@ -139,6 +139,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_current_logical_size",
+        "Current logical size grouped by timeline",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
 // or in testing they estimate how much we would upload if we did.
 static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -234,6 +243,8 @@ struct TimelineMetrics {
     pub last_record_gauge: IntGauge,
     pub wait_lsn_time_histo: Histogram,
     pub current_physical_size_gauge: UIntGauge,
+    /// copy of LayeredTimeline.current_logical_size
+    pub current_logical_size_gauge: IntGauge,
 }
 
 impl TimelineMetrics {
@@ -271,6 +282,9 @@ impl TimelineMetrics {
         let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &timeline_id])
             .unwrap();
+        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
 
         TimelineMetrics {
             reconstruct_time_histo,
@@ -283,6 +297,7 @@ impl TimelineMetrics {
             last_record_gauge,
             wait_lsn_time_histo,
             current_physical_size_gauge,
+            current_logical_size_gauge,
         }
     }
 }
@@ -391,6 +406,11 @@ pub struct Timeline {
     /// get_current_logical_size() will clamp the returned value to zero if it's
     /// negative, and log an error. Could set it permanently to zero or some
     /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a prometheus metric,
+    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
+    /// and `set_current_logical_size` functions to modify this; they will
+    /// also keep the prometheus metric in sync.
     current_logical_size: AtomicI64,
 
     /// Information about the last processed message by the WAL receiver,
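The clamp-to-zero behaviour described in the doc comment above can be pictured with a small standalone sketch; the helper name is hypothetical and this is not the pageserver's actual accessor:

// Hypothetical illustration of the documented clamping: a logical size that
// has gone negative is reported as 0 (and the error logged) rather than
// returned as a nonsensical negative number.
fn clamped_logical_size(raw: i64) -> u64 {
    if raw < 0 {
        eprintln!("error: current_logical_size is negative: {}", raw);
        0
    } else {
        raw as u64
    }
}

fn main() {
    assert_eq!(clamped_logical_size(-42), 0);
    assert_eq!(clamped_logical_size(8 * 1024 * 1024), 8 * 1024 * 1024);
}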
@@ -827,8 +847,7 @@ impl Timeline {
         //
         // Logical size 0 means that it was not initialized, so don't believe that.
         if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
-            self.current_logical_size
-                .store(ancestor_logical_size as i64, AtomicOrdering::SeqCst);
+            self.set_current_logical_size(ancestor_logical_size);
             debug!(
                 "logical size copied from ancestor: {}",
                 ancestor_logical_size
@@ -842,8 +861,7 @@ impl Timeline {
         // Have to calculate it the hard way
         let last_lsn = self.get_last_record_lsn();
         let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
-        self.current_logical_size
-            .store(logical_size as i64, AtomicOrdering::SeqCst);
+        self.set_current_logical_size(logical_size);
         debug!("calculated logical size the hard way: {}", logical_size);
 
         timer.stop_and_record();
@@ -867,6 +885,34 @@ impl Timeline {
         }
     }
 
+    /// Update current logical size, adding `delta` to the old value.
+    fn update_current_logical_size(&self, delta: i64) {
+        let new_size = self
+            .current_logical_size
+            .fetch_add(delta, AtomicOrdering::SeqCst);
+
+        // Also set the value in the prometheus gauge. Note that
+        // there is a race condition here: if this is called by two
+        // threads concurrently, the prometheus gauge might be set to
+        // one value while current_logical_size is set to the
+        // other. Currently, only initialization and the WAL receiver
+        // update the logical size, and they don't run concurrently,
+        // so it cannot happen. And even if it did, it wouldn't be
+        // very serious, the metrics would just be slightly off until
+        // the next update.
+        self.metrics.current_logical_size_gauge.set(new_size);
+    }
+
+    /// Set current logical size.
+    fn set_current_logical_size(&self, new_size: u64) {
+        self.current_logical_size
+            .store(new_size as i64, AtomicOrdering::SeqCst);
+
+        // Also set the value in the prometheus gauge. Same race condition
+        // here as in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(new_size as i64);
+    }
+
     ///
     /// Get a handle to a Layer for reading.
     ///
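A small aside on `update_current_logical_size` as added above: `AtomicI64::fetch_add` returns the value before the addition, so `new_size` there is the pre-delta size and the gauge trails the atomic by one delta. That is consistent with the "metrics would just be slightly off" caveat in the comment; if exactness ever mattered, a variant could publish the post-update value instead. A minimal standalone sketch (illustrative only, not part of this commit):

use std::sync::atomic::{AtomicI64, Ordering};

// Illustrative helper: fetch_add returns the previous value, so the
// post-update size is previous + delta.
fn add_and_publish(size: &AtomicI64, delta: i64) -> i64 {
    let previous = size.fetch_add(delta, Ordering::SeqCst);
    previous + delta
}

fn main() {
    let size = AtomicI64::new(100);
    assert_eq!(add_and_publish(&size, 25), 125);
    assert_eq!(size.load(Ordering::SeqCst), 125);
}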
@@ -2261,9 +2307,7 @@ impl<'a> TimelineWriter<'a> {
     }
 
     pub fn update_current_logical_size(&self, delta: i64) {
-        self.tl
-            .current_logical_size
-            .fetch_add(delta, AtomicOrdering::SeqCst);
+        self.tl.update_current_logical_size(delta)
     }
 }

@@ -1,4 +1,5 @@
 from contextlib import closing
+import math
 import random
 from uuid import UUID
 import re
@@ -278,11 +279,13 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
     assert_physical_size(env, env.initial_tenant, new_timeline_id)
 
 
-def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
+# The timeline logical and physical sizes are also exposed as prometheus metrics.
+# Test the metrics.
+def test_timeline_size_metrics(neon_simple_env: NeonEnv):
     env = neon_simple_env
 
-    new_timeline_id = env.neon_cli.create_branch('test_timeline_physical_size_metric')
-    pg = env.postgres.create_start("test_timeline_physical_size_metric")
+    new_timeline_id = env.neon_cli.create_branch('test_timeline_size_metrics')
+    pg = env.postgres.create_start("test_timeline_size_metrics")
 
     pg.safe_psql_many([
         "CREATE TABLE foo (t text)",
@@ -301,12 +304,32 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
         metrics,
         re.MULTILINE)
     assert matches
-
-    # assert that the metric matches the actual physical size on disk
     tl_physical_size_metric = int(matches.group(1))
+
+    # assert that the physical size metric matches the actual physical size on disk
     timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
     assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
+
+    # Check that the logical size metric is sane, and matches pg_database_size()
+    matches = re.search(
+        f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant.hex}",timeline_id="{new_timeline_id.hex}"}} (\\S+)$',
+        metrics,
+        re.MULTILINE)
+    assert matches
+    tl_logical_size_metric = int(matches.group(1))
+
+    # An empty database is around 8 MB. There are at least 3 databases: 'postgres',
+    # 'template0', 'template1'. So the total size should be about 32 MB. This isn't
+    # very accurate and can change with different PostgreSQL versions, so allow a
+    # couple of MB of slack.
+    assert math.isclose(tl_logical_size_metric, 32 * 1024 * 1024, abs_tol=2 * 1024 * 1024)
+
+    # The sum of the sizes of all databases, as seen by pg_database_size(), should also
+    # be close. Again allow some slack; the logical size metric includes some things like
+    # the SLRUs that are not included in pg_database_size().
+    dbsize_sum = pg.safe_psql("select sum(pg_database_size(oid)) from pg_database")[0][0]
+    assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)
 
 
 def test_tenant_physical_size(neon_simple_env: NeonEnv):
     random.seed(100)