From 2a3a8ee31d5ddf98a8b1e335034ddbdd2818dc12 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 24 Apr 2024 14:52:46 +0100 Subject: [PATCH] pageserver: publish the same metrics from both read paths (#7486) ## Problem Vectored and non-vectored read paths don't publish the same set of metrics. Metrics parity is needed for coalescing the read paths. ## Summary of changes * Publish reconstruct time and fetching data for reconstruct time from the vectored read path * Remove pageserver_getpage_reconstruct_seconds{res="err"} - wasn't used anyway --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++------- pageserver/src/tenant/timeline.rs | 22 +++++++++++-- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 66bf21ddec..6ce7f286b3 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -105,31 +105,39 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { }); // Metrics collected on operations on the storage repository. +#[derive( + Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, +)] +pub(crate) enum GetKind { + Singular, + Vectored, +} pub(crate) struct ReconstructTimeMetrics { - ok: Histogram, - err: Histogram, + singular: Histogram, + vectored: Histogram, } pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", - &["result"], + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric"); + ReconstructTimeMetrics { - ok: inner.get_metric_with_label_values(&["ok"]).unwrap(), - err: inner.get_metric_with_label_values(&["err"]).unwrap(), + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), } }); impl ReconstructTimeMetrics { - pub(crate) fn for_result(&self, result: &Result) -> &Histogram { - match result { - Ok(_) => &self.ok, - Err(_) => &self.err, + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, } } } @@ -142,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::n .expect("failed to define a metric") }); -pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { - register_histogram!( +pub(crate) struct ReconstructDataTimeMetrics { + singular: Histogram, + vectored: Histogram, +} + +impl ReconstructDataTimeMetrics { + pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram { + match get_kind { + GetKind::Singular => &self.singular, + GetKind::Vectored => &self.vectored, + } + } +} + +pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + let inner = register_histogram_vec!( "pageserver_getpage_get_reconstruct_data_seconds", "Time spent in get_reconstruct_value_data", + &["get_kind"], CRITICAL_OP_BUCKETS.into(), ) - .expect("failed to define a metric") + .expect("failed to define a metric"); + + ReconstructDataTimeMetrics { + singular: inner.with_label_values(&[GetKind::Singular.into()]), + vectored: inner.with_label_values(&[GetKind::Vectored.into()]), + } }); pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 22b8a17874..11d96bf1a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -86,7 +86,7 @@ use crate::{ use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::{ - TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use crate::tenant::config::TenantConfOpt; @@ -797,7 +797,9 @@ impl Timeline { img: cached_page_img, }; - let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer(); + let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(GetKind::Singular) + .start_timer(); let path = self .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; @@ -807,7 +809,7 @@ impl Timeline { let res = self.reconstruct_value(key, lsn, reconstruct_state).await; let elapsed = start.elapsed(); crate::metrics::RECONSTRUCT_TIME - .for_result(&res) + .for_get_kind(GetKind::Singular) .observe(elapsed.as_secs_f64()); if cfg!(feature = "testing") && res.is_err() { @@ -969,9 +971,22 @@ impl Timeline { ) -> Result>, GetVectoredError> { let mut reconstruct_state = ValuesReconstructState::new(); + let get_kind = if keyspace.total_size() == 1 { + GetKind::Singular + } else { + GetKind::Vectored + }; + + let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME + .for_get_kind(get_kind) + .start_timer(); self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx) .await?; + get_data_timer.stop_and_record(); + let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME + .for_get_kind(get_kind) + .start_timer(); let mut results: BTreeMap> = BTreeMap::new(); let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { @@ -987,6 +1002,7 @@ impl Timeline { } } } + reconstruct_timer.stop_and_record(); // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient