test(pageserver): quantify compaction outcome (#7867)

A simple API to collect statistics after compaction, so the outcome is
easy to understand.

The tool reads the layer map and analyzes it range by range instead of
performing single-key operations, which is more efficient than running a
benchmark to collect the same information. It currently computes two key
metrics (a usage sketch follows the list):

* Latest data access efficiency: how many delta layers / image layers
the system needs to iterate through before it can return any key in a
key range.
* (Approximate) PiTR efficiency, as in
https://github.com/neondatabase/neon/issues/7770: simply the number of
delta files in the range. The rationale is that, assuming no image layer
is created, PiTR efficiency is just the cost of collecting records from
the delta layers plus the replay time. The number of delta files (or, in
the future, the estimated size of reads) is a simple yet effective way
of estimating how much effort the pageserver needs to reconstruct a
page.
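
The endpoint is intended to be consumed from tests. A minimal usage
sketch (illustrative only), assuming a test where `client` is a
`PageserverHttpClient` and `tenant_id` / `timeline_id` are already in
hand:

    worst_above_image = 0
    worst_total = 0
    for r in client.perf_info(tenant_id, timeline_id):
        # Each entry describes one disjoint key range of the layer map.
        worst_above_image = max(worst_above_image, r["num_of_deltas_above_image"])
        worst_total = max(worst_total, r["total_num_of_deltas"])
    print(f"worst case: {worst_above_image} deltas above image, {worst_total} deltas total")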

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z
Date: 2024-06-10 04:42:13 -04:00
Committed by: GitHub
Commit: 3e63d0f9e0 (parent: 3b647cd55d)
6 changed files with 151 additions and 0 deletions

@@ -2429,6 +2429,25 @@ async fn list_aux_files(
json_response(StatusCode::OK, files)
}

async fn perf_info(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline.perf_info().await;
json_response(StatusCode::OK, result)
}

async fn ingest_aux_files(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -2856,5 +2875,9 @@ pub fn make_router(
|r| testing_api_handler("list_aux_files", r, list_aux_files),
)
.post("/v1/top_tenants", |r| api_handler(r, post_top_tenants))
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info",
|r| testing_api_handler("perf_info", r, perf_info),
)
.any(handler_404))
}

@@ -1,3 +1,4 @@
pub(crate) mod analysis;
mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;

@@ -0,0 +1,90 @@
use std::{collections::BTreeSet, ops::Range};
use utils::lsn::Lsn;
use super::Timeline;

#[derive(serde::Serialize)]
pub(crate) struct RangeAnalysis {
start: String,
end: String,
has_image: bool,
num_of_deltas_above_image: usize,
total_num_of_deltas: usize,
}

impl Timeline {
pub(crate) async fn perf_info(&self) -> Vec<RangeAnalysis> {
// First, collect all split points of the layers.
let mut split_points = BTreeSet::new();
let mut delta_ranges = Vec::new();
let mut image_ranges = Vec::new();
let all_layer_files = {
let guard = self.layers.read().await;
guard.all_persistent_layers()
};
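// All analysis is done as of the last record LSN, i.e. the latest data.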
let lsn = self.get_last_record_lsn();
for key in all_layer_files {
split_points.insert(key.key_range.start);
split_points.insert(key.key_range.end);
if key.is_delta {
delta_ranges.push((key.key_range.clone(), key.lsn_range.clone()));
} else {
image_ranges.push((key.key_range.clone(), key.lsn_range.start));
}
}
// For each split range, compute the estimated read amplification.
let split_points = split_points.into_iter().collect::<Vec<_>>();
let mut result = Vec::new();
// Walk adjacent pairs of split points; `windows` naturally yields nothing
// for an empty layer map, avoiding an underflow in `len() - 1`.
for window in split_points.windows(2) {
let start = window[0];
let end = window[1];
// Find the latest image layer that contains the information.
let mut maybe_image_layers = image_ranges
.iter()
// We insert split points for all image layers, and therefore a `contains` check for the start point should be enough.
.filter(|(key_range, img_lsn)| key_range.contains(&start) && img_lsn <= &lsn)
.cloned()
.collect::<Vec<_>>();
maybe_image_layers.sort_by(|a, b| a.1.cmp(&b.1));
let image_layer = maybe_image_layers.last().cloned();
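// With no covering image layer, fall back to Lsn::INVALID (0) so that
// every delta below the read point is counted.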
let lsn_filter_start = image_layer
.as_ref()
.map(|(_, lsn)| *lsn)
.unwrap_or(Lsn::INVALID);
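// Two half-open LSN ranges overlap unless one ends at or before the
// start of the other.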
fn overlaps_with(lsn_range_a: &Range<Lsn>, lsn_range_b: &Range<Lsn>) -> bool {
!(lsn_range_a.end <= lsn_range_b.start || lsn_range_a.start >= lsn_range_b.end)
}
let maybe_delta_layers = delta_ranges
.iter()
.filter(|(key_range, lsn_range)| {
key_range.contains(&start) && overlaps_with(&(lsn_filter_start..lsn), lsn_range)
})
.cloned()
.collect::<Vec<_>>();
let pitr_delta_layers = delta_ranges
.iter()
.filter(|(key_range, _)| key_range.contains(&start))
.cloned()
.collect::<Vec<_>>();
result.push(RangeAnalysis {
start: start.to_string(),
end: end.to_string(),
has_image: image_layer.is_some(),
num_of_deltas_above_image: maybe_delta_layers.len(),
total_num_of_deltas: pitr_delta_layers.len(),
});
}
result
}
}
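
For intuition, here is a toy sketch (not part of the patch) of the
split-point technique the analysis relies on, with plain integers
standing in for `Key` and half-open `(start, end)` layer ranges:

    layers = [(0, 10), (5, 20), (10, 30)]  # hypothetical layer key ranges
    points = sorted({p for r in layers for p in r})
    # Adjacent boundary pairs form disjoint ranges on which the set of
    # covering layers is constant, so one membership check per range suffices.
    for start, end in zip(points, points[1:]):
        covering = [r for r in layers if r[0] <= start < r[1]]
        print(f"[{start}, {end}): covered by {len(covering)} layer(s)")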

@@ -1,4 +1,5 @@
use anyhow::{bail, ensure, Context, Result};
use itertools::Itertools;
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
use tracing::trace;
@@ -308,6 +309,10 @@ impl LayerManager {
pub(crate) fn contains(&self, layer: &Layer) -> bool {
self.layer_fmgr.contains(layer)
}

pub(crate) fn all_persistent_layers(&self) -> Vec<PersistentLayerKey> {
self.layer_fmgr.0.keys().cloned().collect_vec()
}
}

pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);

@@ -923,3 +923,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
)
self.verbose_error(res)
return res.json()  # type: ignore

def perf_info(
self,
tenant_id: Union[TenantId, TenantShardId],
timeline_id: TimelineId,
):
self.is_testing_enabled_or_skip()
log.info(f"Requesting perf info: tenant {tenant_id}, timeline {timeline_id}")
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/perf_info",
)
log.info(f"Got perf info response code: {res.status_code}")
self.verbose_error(res)
return res.json()

@@ -75,12 +75,29 @@ def test_gc_feedback(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma
physical_size = client.timeline_detail(tenant_id, timeline_id)["current_physical_size"]
log.info(f"Physical storage size {physical_size}")
max_num_of_deltas_above_image = 0
max_total_num_of_deltas = 0
for key_range in client.perf_info(tenant_id, timeline_id):
max_total_num_of_deltas = max(max_total_num_of_deltas, key_range["total_num_of_deltas"])
max_num_of_deltas_above_image = max(
max_num_of_deltas_above_image, key_range["num_of_deltas_above_image"]
)
MB = 1024 * 1024
zenbenchmark.record("logical_size", logical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record("physical_size", physical_size // MB, "Mb", MetricReport.LOWER_IS_BETTER)
zenbenchmark.record(
"physical/logical ratio", physical_size / logical_size, "", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"max_total_num_of_deltas", max_total_num_of_deltas, "", MetricReport.LOWER_IS_BETTER
)
zenbenchmark.record(
"max_num_of_deltas_above_image",
max_num_of_deltas_above_image,
"",
MetricReport.LOWER_IS_BETTER,
)
layer_map_path = env.repo_dir / "layer-map.json"
log.info(f"Writing layer map to {layer_map_path}")