pageserver: publish the same metrics from both read paths (#7486)

## Problem
Vectored and non-vectored read paths don't publish the same set of
metrics. Metrics parity is needed for coalescing the read paths.

## Summary of changes
* Publish reconstruct time and fetching data for reconstruct time from
the vectored read path
* Remove pageserver_getpage_reconstruct_seconds{res="err"} - wasn't used
anyway
This commit is contained in:
Vlad Lazar
2024-04-24 14:52:46 +01:00
committed by GitHub
parent 5dda371c2b
commit 2a3a8ee31d
2 changed files with 59 additions and 15 deletions

View File

@@ -105,31 +105,39 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
#[derive(
Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
)]
pub(crate) enum GetKind {
Singular,
Vectored,
}
pub(crate) struct ReconstructTimeMetrics {
ok: Histogram,
err: Histogram,
singular: Histogram,
vectored: Histogram,
}
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
&["result"],
&["get_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric");
ReconstructTimeMetrics {
ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
err: inner.get_metric_with_label_values(&["err"]).unwrap(),
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
}
});
impl ReconstructTimeMetrics {
pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
match result {
Ok(_) => &self.ok,
Err(_) => &self.err,
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
}
}
}
@@ -142,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
pub(crate) struct ReconstructDataTimeMetrics {
singular: Histogram,
vectored: Histogram,
}
impl ReconstructDataTimeMetrics {
pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
match get_kind {
GetKind::Singular => &self.singular,
GetKind::Vectored => &self.vectored,
}
}
}
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
let inner = register_histogram_vec!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["get_kind"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
.expect("failed to define a metric");
ReconstructDataTimeMetrics {
singular: inner.with_label_values(&[GetKind::Singular.into()]),
vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
}
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {

View File

@@ -86,7 +86,7 @@ use crate::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
@@ -797,7 +797,9 @@ impl Timeline {
img: cached_page_img,
};
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(GetKind::Singular)
.start_timer();
let path = self
.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
@@ -807,7 +809,7 @@ impl Timeline {
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
let elapsed = start.elapsed();
crate::metrics::RECONSTRUCT_TIME
.for_result(&res)
.for_get_kind(GetKind::Singular)
.observe(elapsed.as_secs_f64());
if cfg!(feature = "testing") && res.is_err() {
@@ -969,9 +971,22 @@ impl Timeline {
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
let mut reconstruct_state = ValuesReconstructState::new();
let get_kind = if keyspace.total_size() == 1 {
GetKind::Singular
} else {
GetKind::Vectored
};
let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
.for_get_kind(get_kind)
.start_timer();
self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
.await?;
get_data_timer.stop_and_record();
let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
.for_get_kind(get_kind)
.start_timer();
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let layers_visited = reconstruct_state.get_layers_visited();
for (key, res) in reconstruct_state.keys {
@@ -987,6 +1002,7 @@ impl Timeline {
}
}
}
reconstruct_timer.stop_and_record();
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient