From 5b34d5f561f4565dd22dd5e9be58a9844c6a5476 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 26 Jan 2024 13:40:03 +0000 Subject: [PATCH] pageserver: add vectored get latency histogram (#6461) This patch introduces a new set of Grafana metrics for a histogram: pageserver_get_vectored_seconds_bucket{task_kind="Compaction|PageRequestHandler"}. While it has a `task_kind` label, only compaction and SLRU fetches are tracked. This reduces the increase in cardinality to 24. The metric should allow us to isolate performance regressions while the vectored get is being implemented. Once the implementation is complete, it'll also allow us to quantify the improvements. --- pageserver/src/metrics.rs | 37 +++++++++++++++++++++++++++++++ pageserver/src/tenant/timeline.rs | 4 ++++ 2 files changed, 41 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 2cfa77f1c5..9b3679e3c2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -150,6 +150,43 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) struct GetVectoredLatency { + map: EnumMap<TaskKind, Option<Histogram>>, +} + +impl GetVectoredLatency { + // Only these task types perform vectored gets. Filter all other tasks out to reduce total + // cardinality of the metric. 
+ const TRACKED_TASK_KINDS: [TaskKind; 2] = [TaskKind::Compaction, TaskKind::PageRequestHandler]; + + pub(crate) fn for_task_kind(&self, task_kind: TaskKind) -> Option<&Histogram> { + self.map[task_kind].as_ref() + } +} + +pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| { + let inner = register_histogram_vec!( + "pageserver_get_vectored_seconds", + "Time spent in get_vectored", + &["task_kind"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric"); + + GetVectoredLatency { + map: EnumMap::from_array(std::array::from_fn(|task_kind_idx| { + let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind_idx); + + if GetVectoredLatency::TRACKED_TASK_KINDS.contains(&task_kind) { + let task_kind = task_kind.into(); + Some(inner.with_label_values(&[task_kind])) + } else { + None + } + })), + } +}); + pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c21fe94d01..70c6ee2042 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -678,6 +678,10 @@ impl Timeline { return Err(GetVectoredError::Oversized(key_count)); } + let _timer = crate::metrics::GET_VECTORED_LATENCY + .for_task_kind(ctx.task_kind()) + .map(|t| t.start_timer()); + let mut values = BTreeMap::new(); for range in key_ranges { let mut key = range.start;