From ea5b36c47a593f67116db71841473ce3b95c1ecd Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 22 Apr 2024 13:48:50 +0100 Subject: [PATCH] pageserver: tweak metric counting layer visits on non-vectored read path * Give it a better name * Track all layers * Larger buckets --- pageserver/src/metrics.rs | 12 ++++++++---- pageserver/src/tenant/timeline.rs | 6 +++--- test_runner/fixtures/metrics.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e6db95082b..09a71b0498 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -86,11 +86,14 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); ) .expect("failed to define a metric") }); @@ -2772,6 +2775,7 @@ pub fn preinitialize_metrics() { // histograms [ &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e707c3b244..7a07618f48 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2794,7 +2794,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2909,7 +2909,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } @@ -2936,7 +2936,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"),