From 1d266a6365565b30fc2d913bdf00490c8f51fe9e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 11 May 2023 16:09:29 +0200 Subject: [PATCH] logical size calculation metrics: differentiate regular vs imitated (#4197) I want this distinction so I can prove my assumption that the disk IO peaks which we see every 24h on prod are due to eviction's imitate synthetic size calculations. refs https://github.com/neondatabase/neon/issues/4154 --- pageserver/src/metrics.rs | 5 +++++ pageserver/src/tenant/timeline.rs | 12 +++++++++--- pageserver/src/tenant/timeline/eviction_task.rs | 7 ++++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ec2f49c85a..542fd511e1 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -30,6 +30,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "create images", "init logical size", "logical size", + "imitate logical size", "load layer map", "gc", ]; @@ -688,6 +689,7 @@ pub struct TimelineMetrics { pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, pub logical_size_histo: StorageTimeMetrics, + pub imitate_logical_size_histo: StorageTimeMetrics, pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, @@ -720,6 +722,8 @@ impl TimelineMetrics { let create_images_time_histo = StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let imitate_logical_size_histo = + StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id); let load_layer_map_histo = StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id); let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); @@ -756,6 +760,7 @@ impl TimelineMetrics { compact_time_histo, create_images_time_histo, logical_size_histo, + imitate_logical_size_histo, garbage_collect_histo, load_layer_map_histo, last_record_gauge, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 90f2951aef..658b5d1289 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -49,7 +49,7 @@ use crate::tenant::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; +use crate::metrics::{StorageTimeMetrics, TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; @@ -1938,7 +1938,12 @@ impl Timeline { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation - .calculate_logical_size(lsn, cancel, &ctx) + .calculate_logical_size( + lsn, + &self_calculation.metrics.logical_size_histo, + cancel, + &ctx, + ) .await }); let timeline_state_cancellation = async { @@ -1993,6 +1998,7 @@ impl Timeline { pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, + storage_time_metrics: &StorageTimeMetrics, cancel: CancellationToken, ctx: &RequestContext, ) -> Result { @@ -2026,7 +2032,7 @@ impl Timeline { if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) { return Ok(size); } - let timer = self.metrics.logical_size_histo.start_timer(); + let timer = storage_time_metrics.start_timer(); let logical_size = self .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 523a5f8fa7..eb04e7e579 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -335,7 +335,12 @@ impl Timeline { // imitiate on-restart initial logical size let size = self - .calculate_logical_size(lsn, cancel.clone(), ctx) + .calculate_logical_size( + lsn, + &self.metrics.imitate_logical_size_histo, + cancel.clone(), + ctx, + ) .instrument(info_span!("calculate_logical_size")) .await;