From 8ff7bc5df1b7644825c5379474b7715f3bb2ab21 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Tue, 27 Dec 2022 19:45:09 +0200
Subject: [PATCH] Add timeline_logical_size metric.

Send this metric only when it is fully calculated.

Make consumption metrics more stable:
- Send per-timeline metrics only for active timelines.
- Adjust test assertions to make the test_metric_collection test more stable.
---
 pageserver/src/consumption_metrics.rs         | 43 +++++++++++++------
 pageserver/src/http/routes.rs                 |  2 +-
 pageserver/src/tenant/timeline.rs             |  8 +++-
 .../src/walreceiver/walreceiver_connection.rs |  7 +--
 test_runner/regress/test_metric_collection.py | 16 ++++++-
 5 files changed, 55 insertions(+), 21 deletions(-)
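Note: below is a minimal, self-contained Rust sketch of the gating logic
this patch adds to collect_metrics_task. The stub function and the literal
sizes are illustrative assumptions, not the pageserver's actual types; it
only shows why an approximate size is skipped until the initial
calculation finishes.

    // Stand-in for Timeline::get_current_logical_size(): returns (size, is_exact).
    fn current_logical_size(initial_calc_done: bool) -> (u64, bool) {
        if initial_calc_done {
            (4096, true) // fully calculated: safe to report
        } else {
            (1024, false) // approximate: the initial size task is still running
        }
    }

    fn main() {
        let mut current_metrics: Vec<(&str, u64)> = Vec::new();
        for initial_calc_done in [false, true] {
            let (size, is_exact) = current_logical_size(initial_calc_done);
            // Mirrors the patch: only send the metric when it is fully calculated.
            if is_exact {
                current_metrics.push(("timeline_logical_size", size));
            }
        }
        // Only the exact sample made it into the batch.
        assert_eq!(current_metrics, vec![("timeline_logical_size", 4096)]);
    }
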
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 0d96eb431d..c411a9e025 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -94,6 +94,9 @@ pub enum ConsumptionMetricKind {
     /// Size of the remote storage (S3) directory.
     /// This is an absolute, per-tenant metric.
     RemoteStorageSize,
+    /// Logical size of the data in the timeline.
+    /// This is an absolute, per-timeline metric.
+    TimelineLogicalSize,
 }
 
 impl FromStr for ConsumptionMetricKind {
@@ -105,6 +108,7 @@ impl FromStr for ConsumptionMetricKind {
             "synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
             "resident_size" => Ok(Self::ResidentSize),
             "remote_storage_size" => Ok(Self::RemoteStorageSize),
+            "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
             _ => anyhow::bail!("invalid value \"{s}\" for metric type"),
         }
     }
@@ -117,6 +121,7 @@ impl fmt::Display for ConsumptionMetricKind {
             ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
             ConsumptionMetricKind::ResidentSize => "resident_size",
             ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
+            ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
         })
     }
 }
@@ -191,23 +196,35 @@ pub async fn collect_metrics_task(
 
         // iterate through list of timelines in tenant
         for timeline in tenant.list_timelines().iter() {
-            let timeline_written_size = u64::from(timeline.get_last_record_lsn());
+            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());
 
-            current_metrics.push((
-                ConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: Some(timeline.timeline_id),
-                    metric: ConsumptionMetricKind::WrittenSize,
-                },
-                timeline_written_size,
-            ));
+                current_metrics.push((
+                    ConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: ConsumptionMetricKind::WrittenSize,
+                    },
+                    timeline_written_size,
+                ));
+
+                let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?;
+                // Only send timeline logical size when it is fully calculated.
+                if is_exact {
+                    current_metrics.push((
+                        ConsumptionMetricsKey {
+                            tenant_id,
+                            timeline_id: Some(timeline.timeline_id),
+                            metric: ConsumptionMetricKind::TimelineLogicalSize,
+                        },
+                        timeline_logical_size,
+                    ));
+                }
+            }
 
             let timeline_resident_size = timeline.get_resident_physical_size();
             tenant_resident_size += timeline_resident_size;
-
-            debug!(
-                "per-timeline current metrics for tenant: {}: timeline {} resident_size={} last_record_lsn {} (as bytes)",
-                tenant_id, timeline.timeline_id, timeline_resident_size, timeline_written_size)
         }
 
         let tenant_remote_size = tenant.get_remote_size().await?;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 66a1607801..4f4c397abe 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -120,7 +120,7 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo
         lsn @ Lsn(_) => Some(lsn),
     };
     let current_logical_size = match timeline.get_current_logical_size() {
-        Ok(size) => Some(size),
+        Ok((size, _)) => Some(size),
         Err(err) => {
             error!("Timeline info creation failed to get current logical size: {err:?}");
             None
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index df02f24239..2c22c6694d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -752,18 +752,22 @@ impl Timeline {
     ///
     /// The size could be lagging behind the actual number, in case
     /// the initial size calculation has not been run (gets triggered on the first size access).
-    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<u64> {
+    ///
+    /// Returns the size and a boolean flag that indicates whether the size is exact.
+    pub fn get_current_logical_size(self: &Arc<Self>) -> anyhow::Result<(u64, bool)> {
         let current_size = self.current_logical_size.current_size()?;
         debug!("Current size: {current_size:?}");
 
+        let mut is_exact = true;
         let size = current_size.size();
         if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
+            is_exact = false;
             self.try_spawn_size_init_task(init_lsn);
         }
 
-        Ok(size)
+        Ok((size, is_exact))
     }
 
     /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/walreceiver/walreceiver_connection.rs
index a98126e683..3753807327 100644
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -335,10 +335,11 @@ pub async fn handle_walreceiver_connection(
 
             // Send the replication feedback message.
             // Regular standby_status_update fields are put into this message.
+            let (timeline_logical_size, _) = timeline
+                .get_current_logical_size()
+                .context("Status update creation failed to get current logical size")?;
             let status_update = ReplicationFeedback {
-                current_timeline_size: timeline
-                    .get_current_logical_size()
-                    .context("Status update creation failed to get current logical size")?,
+                current_timeline_size: timeline_logical_size,
                 ps_writelsn: write_lsn,
                 ps_flushlsn: flush_lsn,
                 ps_applylsn: apply_lsn,
diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py
index a3b3609153..ac9f163801 100644
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -42,16 +42,28 @@ def metrics_handler(request: Request) -> Response:
         # >= 0 check here is to avoid race condition when we receive metrics before
         # remote_uploaded is updated
         "remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
+        # logical size may lag behind the actual size, so allow 0 here
+        "timeline_logical_size": lambda value: value >= 0,
     }
 
+    events_received = 0
     for event in events:
-        assert checks.pop(event["metric"])(event["value"]), f"{event['metric']} isn't valid"
+        check = checks.get(event["metric"])
+        # calm down mypy
+        if check is not None:
+            assert check(event["value"]), f"{event['metric']} isn't valid"
+            events_received += 1
 
     global first_request
     # check that all checks were sent
     # but only on the first request, because we don't send non-changed metrics
     if first_request:
-        assert not checks, f"{' '.join(checks.keys())} wasn't/weren't received"
+        # we may receive more metrics than we check,
+        # because there are two timelines
+        # and we may receive per-timeline metrics from both
+        # if the test was slow enough for these metrics to be collected
+        # -1 because it is ok not to receive timeline_logical_size
+        assert events_received >= len(checks) - 1
         first_request = False
 
     global num_metrics_received
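
Note: for completeness, a standalone Rust sketch of the FromStr/Display
round-trip this patch extends. The enum is reduced to the new variant and
uses a plain String error instead of anyhow::bail!, so it is illustrative
rather than the pageserver's actual code.

    use std::fmt;
    use std::str::FromStr;

    #[derive(Debug, PartialEq)]
    enum ConsumptionMetricKind {
        TimelineLogicalSize,
    }

    impl FromStr for ConsumptionMetricKind {
        type Err = String;
        fn from_str(s: &str) -> Result<Self, Self::Err> {
            match s {
                "timeline_logical_size" => Ok(Self::TimelineLogicalSize),
                _ => Err(format!("invalid value \"{s}\" for metric type")),
            }
        }
    }

    impl fmt::Display for ConsumptionMetricKind {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            f.write_str(match self {
                ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
            })
        }
    }

    fn main() {
        // The wire name must round-trip so the sender and the parser agree.
        let kind: ConsumptionMetricKind = "timeline_logical_size".parse().unwrap();
        assert_eq!(kind, ConsumptionMetricKind::TimelineLogicalSize);
        assert_eq!(kind.to_string(), "timeline_logical_size");
    }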