Add timeline_logical_size metric.

Send this metric only when it is fully calculated.

Make consumption metrics more stable:
- Send per-timeline metrics only for active timelines.
- Adjust assertions to make test_metric_collection more stable.
Author: Anastasia Lubennikova
Date:   2022-12-27 19:45:09 +02:00
Commit: 8ff7bc5df1
Parent: 890ff3803e

5 changed files with 55 additions and 21 deletions
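
In short, per-timeline metrics are collected only while a timeline is active, and the new timeline_logical_size value is sent only once the initial size calculation has finished, i.e. when get_current_logical_size reports the size as exact rather than approximate. A minimal standalone sketch of that gating, using hypothetical stand-in types rather than the real pageserver structs:

    // Standalone sketch of the gating rules described in the commit message;
    // the types below are simplified stand-ins, not the actual pageserver code.
    #[derive(Debug)]
    enum Metric {
        WrittenSize(u64),
        TimelineLogicalSize(u64),
    }

    struct FakeTimeline {
        active: bool,
        written_size: u64,
        logical_size: u64,
        // false while the initial logical-size calculation is still running
        logical_size_is_exact: bool,
    }

    fn collect(timeline: &FakeTimeline) -> Vec<Metric> {
        let mut metrics = Vec::new();
        // per-timeline metrics are collected only for active timelines
        if timeline.active {
            metrics.push(Metric::WrittenSize(timeline.written_size));
            // the logical size is sent only once it is fully calculated
            if timeline.logical_size_is_exact {
                metrics.push(Metric::TimelineLogicalSize(timeline.logical_size));
            }
        }
        metrics
    }

    fn main() {
        let tl = FakeTimeline {
            active: true,
            written_size: 4096,
            logical_size: 1024,
            logical_size_is_exact: false,
        };
        // prints only WrittenSize(4096), because the logical size is still approximate
        println!("{:?}", collect(&tl));
    }

With logical_size_is_exact set to false, only the written-size metric is produced; flipping it to true adds the logical-size metric, which mirrors the behaviour the diffs below implement.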


@@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
+    /// Logical size of the data in the timeline
+    /// This is an absolute, per-timeline metric
+    TimelineLogicalSize,
 }
 
 impl FromStr for ConsumptionMetricKind {
@@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind {
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
             "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
     }
@@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind {
             ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
             ConsumptionMetricKind::ResidentSize => "resident_size",
             ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
         })
     }
 }
@@ -191,23 +196,35 @@ pub async fn collect_metrics_task(
         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
-            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());
 
-            current_metrics.push((
-                ConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: Some(timeline.timeline_id),
-                    metric: ConsumptionMetricKind::WrittenSize,
-                },
-                timeline_written_size,
-            ));
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }
 
             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
 
             debug!(
                 "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
                 tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }
 
         let tenant_remote_size = tenant.get_remote_size().await?;


@@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None

@@ -752,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
    /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");
 
+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }
 
-        Ok(size)
+        Ok((size, is_exact))
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in


@@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection(
         // Send the replication feedback message.
         // Regular standby_status_update fields are put into this message.
+        let (timeline_logical_size, _) = timeline
+            .get_current_logical_size()
+            .context("Status update creation failed to get current logical size")?;
         let status_update = ReplicationFeedback {
-            current_timeline_size: timeline
-                .get_current_logical_size()
-                .context("Status update creation failed to get current logical size")?,
+            current_timeline_size: timeline_logical_size,
             ps_writelsn: write_lsn,
             ps_flushlsn: flush_lsn,
             ps_applylsn: apply_lsn,


@@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response:
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+        # logical size may lag behind the actual size, so allow 0 here
+        "timeline_logical_size": lambda value: value >= 0,
     }
 
+    events_received = 0
     for event in events:
-        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+        check = checks.get(event["metric"])
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{event['metric']} isn't valid"
+        events_received += 1
 
     global first_request
     # check that all checks were sent
     # but only on the first request, because we don't send non-changed metrics
     if first_request:
-        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        # we may receive more metrics than we check,
+        # because there are two timelines
+        # and we may receive per-timeline metrics from both
+        # if the test was slow enough for these metrics to be collected
+        # -1 because that is ok to not receive timeline_logical_size
+        assert events_received >= len(checks) - 1
         first_request = False
 
    global num_metrics_received