Update WAL redo histograms (#2323)

Previously, it could only distinguish REDO task durations down to 5ms, which equates to approx. 200pages/sec or 1.6MB/sec getpage@LSN traffic. This patch improves to 200'000 pages/sec or 1.6GB/sec, allowing for much more precise performance measurement of the redo process.
2026-01-14 00:42:54 +00:00 · 2022-08-25 16:17:32 +02:00
parent c952f022bb
commit bc588f3a53
1 changed files with 43 additions and 3 deletions
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -89,15 +89,52 @@ pub trait WalRedoManager: Send + Sync {
 // for access to the postgres process ('wait') since there is only one for
 // each tenant.

+/// Time buckets are small because we want to be able to measure the
+/// smallest redo processing times. These buckets allow us to measure down
+/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
+/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
+macro_rules! redo_histogram_time_buckets {
+    () => {
+        vec![
+            0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
+            0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
+        ]
+    };
+}
+
+/// While we're at it, also measure the amount of records replayed in each
+/// operation. We have a global 'total replayed' counter, but that's not
+/// as useful as 'what is the skew for how many records we replay in one
+/// operation'.
+macro_rules! redo_histogram_count_buckets {
+    () => {
+        vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
+    };
+}
+
 static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
-        .expect("failed to define a metric")
+    register_histogram!(
+        "pageserver_wal_redo_seconds",
+        "Time spent on WAL redo",
+        redo_histogram_time_buckets!()
+    )
+    .expect("failed to define a metric")
 });

 static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the WAL redo process"
+        "Time spent waiting for access to the WAL redo process",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
+static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_records_histogram",
+        "Histogram of number of records replayed per redo",
+        redo_histogram_count_buckets!(),
    )
    .expect("failed to define a metric")
 });
@@ -262,7 +299,10 @@ impl PostgresRedoManager {

        let end_time = Instant::now();
        let duration = end_time.duration_since(lock_time);
+
        WAL_REDO_TIME.observe(duration.as_secs_f64());
+        WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
+
        debug!(
            "postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
            records.len(),