mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 03:20:36 +00:00
metrics: distinguish page reconstruction success & failure (#5463)
Here's the existing dashboards that use the metric: https://github.com/search?q=repo%3Aneondatabase%2Fgrafana-dashboard-export%20pageserver_getpage_reconstruct_seconds&type=code Looks like only `_count` and `_sum` values are used currently. We can fix them up easily post merge. I think the histogram is worth keeping, though. follow-up to https://github.com/neondatabase/neon/pull/5459#pullrequestreview-1657072882 --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>
This commit is contained in:
committed by
GitHub
parent
dee2bcca44
commit
25bf791568
@@ -94,15 +94,35 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
|
||||
pub(crate) struct ReconstructTimeMetrics {
|
||||
ok: Histogram,
|
||||
err: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
&["result"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
.expect("failed to define a metric");
|
||||
ReconstructTimeMetrics {
|
||||
ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
|
||||
err: inner.get_metric_with_label_values(&["err"]).unwrap(),
|
||||
}
|
||||
});
|
||||
|
||||
impl ReconstructTimeMetrics {
|
||||
pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
|
||||
match result {
|
||||
Ok(_) => &self.ok,
|
||||
Err(_) => &self.err,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
@@ -1856,7 +1876,6 @@ pub fn preinitialize_metrics() {
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&RECONSTRUCT_TIME,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
@@ -1867,4 +1886,7 @@ pub fn preinitialize_metrics() {
|
||||
.for_each(|h| {
|
||||
Lazy::force(h);
|
||||
});
|
||||
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
|
||||
use crate::metrics::{
|
||||
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
RECONSTRUCT_TIME, UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
@@ -501,9 +501,12 @@ impl Timeline {
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
let timer = RECONSTRUCT_TIME.start_timer();
|
||||
let start = Instant::now();
|
||||
let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
|
||||
timer.stop_and_record();
|
||||
let elapsed = start.elapsed();
|
||||
crate::metrics::RECONSTRUCT_TIME
|
||||
.for_result(&res)
|
||||
.observe(elapsed.as_secs_f64());
|
||||
|
||||
if cfg!(feature = "testing") && res.is_err() {
|
||||
// it can only be walredo issue
|
||||
|
||||
Reference in New Issue
Block a user