From 28e7fa98c4d8f8ef96fd2931f03543f8e06a2389 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 23 Apr 2024 14:05:02 +0100 Subject: [PATCH] pageserver: add read depth metrics and test (#7464) ## Problem We recently went through an incident where compaction was inhibited by a bug. We didn't observe this until quite late because we did not have alerting on deep reads. ## Summary of changes + Tweak an existing metric that tracks the depth of a read on the non-vectored read path: * Give it a better name * Track all layers * Larger buckets + Add a similar metric for the vectored read path + Add a compaction smoke test which uses these metrics. This test would have caught the compaction issue mentioned earlier. Related https://github.com/neondatabase/neon/issues/7428 --- pageserver/src/metrics.rs | 20 ++++-- pageserver/src/tenant/storage_layer.rs | 10 +++ pageserver/src/tenant/timeline.rs | 15 ++++- test_runner/fixtures/metrics.py | 2 +- test_runner/regress/test_compaction.py | 93 ++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 9 deletions(-) create mode 100644 test_runner/regress/test_compaction.py diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e6db95082b..66bf21ddec 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { register_histogram!( - "pageserver_read_num_fs_layers", - "Number of persistent layers accessed for processing a read request, including those in the cache", - vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + "pageserver_layers_visited_per_read_global", + "Number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric") +}); + +pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_layers_visited_per_vectored_read_global", + "Average number of layers visited to reconstruct one key", + vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], ) .expect("failed to define a metric") }); @@ -2771,7 +2780,8 @@ pub fn preinitialize_metrics() { // histograms [ - &READ_NUM_FS_LAYERS, + &READ_NUM_LAYERS_VISITED, + &VEC_READ_NUM_LAYERS_VISITED, &WAIT_LSN_TIME, &WAL_REDO_TIME, &WAL_REDO_RECORDS_HISTOGRAM, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9a2b086828..9ddd916700 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState { pub(crate) keys: HashMap>, keys_done: KeySpaceRandomAccum, + layers_visited: u32, } impl ValuesReconstructState { @@ -125,6 +126,7 @@ impl ValuesReconstructState { Self { keys: HashMap::new(), keys_done: KeySpaceRandomAccum::new(), + layers_visited: 0, } } @@ -138,6 +140,14 @@ impl ValuesReconstructState { } } + pub(crate) fn on_layer_visited(&mut self) { + self.layers_visited += 1; + } + + pub(crate) fn get_layers_visited(&self) -> u32 { + self.layers_visited + } + /// Update the state collected for a given key. /// Returns true if this was the last value needed for the key and false otherwise. /// diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fb5ee0a8fa..2fbe3c63a2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -973,6 +973,7 @@ impl Timeline { .await?; let mut results: BTreeMap> = BTreeMap::new(); + let layers_visited = reconstruct_state.get_layers_visited(); for (key, res) in reconstruct_state.keys { match res { Err(err) => { @@ -987,6 +988,12 @@ impl Timeline { } } + // Note that this is an approximation. Tracking the exact number of layers visited + // per key requires virtually unbounded memory usage and is inefficient + // (i.e. segment tree tracking each range queried from a layer) + crate::metrics::VEC_READ_NUM_LAYERS_VISITED + .observe(layers_visited as f64 / results.len() as f64); + Ok(results) } @@ -2813,7 +2820,7 @@ impl Timeline { let mut timeline = self; let mut read_count = scopeguard::guard(0, |cnt| { - crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64) + crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64) }); // For debugging purposes, collect the path of layers that we traversed @@ -2928,7 +2935,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, open_layer.traversal_id())); continue 'outer; } @@ -2955,7 +2962,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; - // metrics: open_layer does not count as fs access, so we are not updating `read_count` + *read_count += 1; traversal_path.push((result, cont_lsn, frozen_layer.traversal_id())); continue 'outer; } @@ -3183,6 +3190,8 @@ impl Timeline { unmapped_keyspace = keyspace_to_read; cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(); } else { break; } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index c615dd154f..7d34e12ca3 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( "pageserver_getpage_reconstruct_seconds_sum", *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]], *histogram("pageserver_smgr_query_seconds_global"), - *histogram("pageserver_read_num_fs_layers"), + *histogram("pageserver_layers_visited_per_read_global"), *histogram("pageserver_getpage_get_reconstruct_data_seconds"), *histogram("pageserver_wait_lsn_seconds"), *histogram("pageserver_remote_operation_seconds"), diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py new file mode 100644 index 0000000000..37b87b92a9 --- /dev/null +++ b/test_runner/regress/test_compaction.py @@ -0,0 +1,93 @@ +import os + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.workload import Workload + +AGGRESIVE_COMPACTION_TENANT_CONF = { + # Disable gc and compaction. The test runs compaction manually. + "gc_period": "0s", + "compaction_period": "0s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 2, + # INC-186: remove when merging the fix + "image_layer_creation_check_threshold": 0, +} + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): + """ + This is a smoke test that compaction kicks in. The workload repeatedly churns + a small number of rows and manually instructs the pageserver to run compaction + between iterations. At the end of the test validate that the average number of + layers visited to gather reconstruct data for a given key is within the empirically + observed bounds. + """ + + # Effectively disable the page cache to rely only on image layers + # to shorten reads. + neon_env_builder.pageserver_config_override = """ +page_cache_size=10 +""" + + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 10000 + churn_rounds = 100 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + if i % 10 == 0: + log.info(f"Running churn round {i}/{churn_rounds} ...") + + workload.churn_rows(row_count, env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + log.info("Checking layer access metrics ...") + + layer_access_metric_names = [ + "pageserver_layers_visited_per_read_global_sum", + "pageserver_layers_visited_per_read_global_count", + "pageserver_layers_visited_per_read_global_bucket", + "pageserver_layers_visited_per_vectored_read_global_sum", + "pageserver_layers_visited_per_vectored_read_global_count", + "pageserver_layers_visited_per_vectored_read_global_bucket", + ] + + metrics = env.pageserver.http_client().get_metrics() + for name in layer_access_metric_names: + layer_access_metrics = metrics.query_all(name) + log.info(f"Got metrics: {layer_access_metrics}") + + non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum") + non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count") + non_vectored_average = non_vectored_sum.value / non_vectored_count.value + + vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum") + vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count") + vectored_average = vectored_sum.value / vectored_count.value + + log.info(f"{non_vectored_average=} {vectored_average=}") + + # The upper bound for average number of layer visits below (8) + # was chosen empirically for this workload. + assert non_vectored_average < 8 + assert vectored_average < 8