diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 77c0967afc..30f3a49a9d 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -116,11 +116,17 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { +/// Measures layers visited per read (i.e. read amplification). +/// +/// NB: for a batch, we count all visited layers towards each read. While the cost of layer visits +/// are amortized across the batch, and some layers may not intersect with a given key, each visited +/// layer contributes directly to the observed latency for every read in the batch, which is what we +/// care about. +pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_layers_visited_per_vectored_read_global", - "Average number of layers visited to reconstruct one key", - vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + "pageserver_layers_per_read_global", + "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -3912,7 +3918,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { // histograms [ - &VEC_READ_NUM_LAYERS_VISITED, + &LAYERS_PER_READ_GLOBAL, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b4b30fcd23..71f0b4c9bf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,6 +51,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::rate_limit::RateLimit; use utils::{ fs_ext, guard_arc_swap::GuardArcSwap, @@ -115,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; -use crate::metrics::TimelineMetrics; +use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL}; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; use pageserver_api::reltag::RelTag; @@ -1044,7 +1045,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; - pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; + pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100; /// Look up multiple page versions at a given LSN /// @@ -1221,25 +1222,27 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { - let avg = layers_visited as f64 / results.len() as f64; - if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = + // Record the total number of layers visited towards each key in the batch. While some + // layers may not intersect with a given read, and the cost of layer visits are + // amortized across the batch, each visited layer contributes directly to the observed + // latency for every read in the batch, which is what we care about. + if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { + LOG_PACER.lock().unwrap().call(|| { + let num_keys = keyspace.total_raw_size(); + let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), lsn = %lsn, - "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", - keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + ); }); } - // Note that this is an approximation. Tracking the exact number of layers visited - // per key requires virtually unbounded memory usage and is inefficient - // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); + for _ in &results { + LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64); + } } Ok(results) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 2edfc884ad..763a63c2e5 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -86,9 +86,9 @@ page_cache_size=10 log.info("Checking layer access metrics ...") layer_access_metric_names = [ - "pageserver_layers_visited_per_vectored_read_global_sum", - "pageserver_layers_visited_per_vectored_read_global_count", - "pageserver_layers_visited_per_vectored_read_global_bucket", + "pageserver_layers_per_read_global_sum", + "pageserver_layers_per_read_global_count", + "pageserver_layers_per_read_global_bucket", ] metrics = env.pageserver.http_client().get_metrics() @@ -96,8 +96,8 @@ page_cache_size=10 layer_access_metrics = metrics.query_all(name) log.info(f"Got metrics: {layer_access_metrics}") - vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") - vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_sum = metrics.query_one("pageserver_layers_per_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_per_read_global_count") if vectored_count.value > 0: assert vectored_sum.value > 0 vectored_average = vectored_sum.value / vectored_count.value