pageserver: add pageserver_deltas_per_read_global metric (#10570)

## Problem

We suspect that Postgres checkpoints will limit the number of page
deltas necessary to reconstruct a page, but don't know for certain.

Touches https://github.com/neondatabase/cloud/issues/23283.

## Summary of changes

Add `pageserver_deltas_per_read_global` metric.

This pairs with `pageserver_layers_per_read_global` from #10573.
This commit is contained in:
Erik Grinaker
2025-01-30 11:55:07 +01:00
committed by GitHub
parent b24727134c
commit d3db96c211
3 changed files with 23 additions and 1 deletions

View File

@@ -131,6 +131,16 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
// We expect this to be low because of Postgres checkpoints. Let's see if that holds.
register_histogram!(
"pageserver_deltas_per_read_global",
"Number of delta pages applied to image page per read",
vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
)
.expect("failed to define a metric")
});
pub(crate) static CONCURRENT_INITDBS: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_concurrent_initdb",
@@ -3919,6 +3929,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
// histograms
[
&LAYERS_PER_READ_GLOBAL,
&DELTAS_PER_READ_GLOBAL,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,

View File

@@ -80,6 +80,16 @@ pub(crate) struct ValueReconstructState {
pub(crate) img: Option<(Lsn, Bytes)>,
}
impl ValueReconstructState {
/// Returns the number of page deltas applied to the page image.
pub fn num_deltas(&self) -> usize {
match self.img {
Some(_) => self.records.len(),
None => self.records.len() - 1, // omit will_init record
}
}
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) enum ValueReconstructSituation {
Complete,

View File

@@ -116,7 +116,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::{TimelineMetrics, LAYERS_PER_READ_GLOBAL};
use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
@@ -1195,6 +1195,7 @@ impl Timeline {
return (key, Err(err));
}
};
DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
// The walredo module expects the records to be descending in terms of Lsn.
// And we submit the IOs in that order, so, there shuold be no need to sort here.