Add timeline_logical_size metric.
Send this metric only when it is fully calculated.

Make consumption metrics more stable:
- Send per-timeline metrics only for active timelines.
- Adjust test assertions to make test_metric_collection test more stable.
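For context on the first point, the diff changes `Timeline::get_current_logical_size` to return a `(size, is_exact)` pair, and the collector pushes the `timeline_logical_size` metric only when the flag is true. Below is a minimal, self-contained sketch of that gating pattern; `report_metric` and the stubbed size function are illustrative stand-ins, not names from the codebase.

```rust
// Sketch of the "send only when fully calculated" gating used by the collector.
// The stub below stands in for Timeline::get_current_logical_size; report_metric
// is a made-up placeholder for pushing into the current_metrics vector.
fn get_current_logical_size() -> (u64, bool) {
    // Pretend the initial size calculation has not finished yet,
    // so only an approximate value is available.
    (0, false)
}

fn report_metric(name: &str, value: u64) {
    println!("would send {name}={value}");
}

fn main() {
    let (timeline_logical_size, is_exact) = get_current_logical_size();
    // Only send the logical size once it is exact; an approximate value
    // would make the consumption metric fluctuate between reports.
    if is_exact {
        report_metric("timeline_logical_size", timeline_logical_size);
    }
}
```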
@@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind {
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
+    /// Logical size of the data in the timeline
+    /// This is an absolute, per-timeline metric
+    TimelineLogicalSize,
 }

 impl FromStr for ConsumptionMetricKind {
@@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind {
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
             "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
     }
@@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind {
             ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
             ConsumptionMetricKind::ResidentSize => "resident_size",
             ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
         })
     }
 }
@@ -191,23 +196,35 @@ pub async fn collect_metrics_task(

         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
-            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-            current_metrics.push((
-                ConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: Some(timeline.timeline_id),
-                    metric: ConsumptionMetricKind::WrittenSize,
-                },
-                timeline_written_size,
-            ));
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }

             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
-
-            debug!(
-                "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
-                tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }

         let tenant_remote_size = tenant.get_remote_size().await?;
@@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None
@@ -752,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
     /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");

+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }

-        Ok(size)
+        Ok((size, is_exact))
     }

     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
@@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection(

         // Send the replication feedback message.
         // Regular standby_status_update fields are put into this message.
+        let (timeline_logical_size, _) = timeline
+            .get_current_logical_size()
+            .context("Status update creation failed to get current logical size")?;
         let status_update = ReplicationFeedback {
-            current_timeline_size: timeline
-                .get_current_logical_size()
-                .context("Status update creation failed to get current logical size")?,
+            current_timeline_size: timeline_logical_size,
             ps_writelsn: write_lsn,
             ps_flushlsn: flush_lsn,
             ps_applylsn: apply_lsn,
@@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response:
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+        # logical size may lag behind the actual size, so allow 0 here
+        "timeline_logical_size": lambda value: value >= 0,
     }

+    events_received = 0
     for event in events:
-        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+        check = checks.get(event["metric"])
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{event['metric']} isn't valid"
+        events_received += 1

     global first_request
     # check that all checks were sent
     # but only on the first request, because we don't send non-changed metrics
     if first_request:
-        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        # we may receive more metrics than we check,
+        # because there are two timelines
+        # and we may receive per-timeline metrics from both
+        # if the test was slow enough for these metrics to be collected
+        # -1 because that is ok to not receive timeline_logical_size
+        assert events_received >= len(checks) - 1
         first_request = False

     global num_metrics_received