Compare commits

..

1 Commits

Author SHA1 Message Date
Bojan Serafimov
639dcb24ff Add bench feature 2023-05-31 21:15:45 -04:00
25 changed files with 652 additions and 1031 deletions

1
Cargo.lock generated
View File

@@ -2530,7 +2530,6 @@ dependencies = [
"bytes",
"clap 4.3.0",
"git-version",
"itertools",
"pageserver",
"postgres_ffi",
"svg_fmt",

View File

@@ -33,7 +33,5 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.init();
tracing::info!("logging and tracing started");
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
Ok(())
}

View File

@@ -52,7 +52,9 @@ completion, or shield the rest of the code from surprise cancellations
by spawning a separate task. The code that handles incoming HTTP
requests, for example, spawns a separate task for each request,
because Hyper will drop the request-handling Future if the HTTP
connection is lost.
connection is lost. (FIXME: our HTTP handlers do not do that
currently, but we should fix that. See [issue
3478](https://github.com/neondatabase/neon/issues/3478)).
#### How to cancel, then?

View File

@@ -40,12 +40,6 @@ struct RequestId(String);
///
/// This also handles errors, logging them and converting them to an HTTP error response.
///
/// NB: If the client disconnects, Hyper will drop the Future, without polling it to
/// completion. In other words, the handler must be async cancellation safe! request_span
/// prints a warning to the log when that happens, so that you have some trace of it in
/// the log.
///
///
/// There could be other ways to implement similar functionality:
///
/// * procmacros placed on top of all handler methods

View File

@@ -9,6 +9,9 @@ default = []
# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints"]
# Just a marker that compiles mock structs that are used in both tests and benchmarks. We
# hide them behind a feature flag so that we can apply stronger lints to prod-only code.
bench = []
[dependencies]
anyhow.workspace = true

View File

@@ -3,10 +3,10 @@
# How to run
To run all benchmarks:
`cargo bench`
`cargo bench --features bench`
To run a specific file:
`cargo bench --bench bench_layer_map`
`cargo bench --features bench --bench bench_layer_map`
To run a specific function:
`cargo bench --bench bench_layer_map -- real_map_uniform_queries`
`cargo bench --features bench --bench bench_layer_map -- real_map_uniform_queries`

View File

@@ -1,245 +1,264 @@
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
// Hiding this code under a compilation flag allows us to lint it differently than prod code
#[cfg(feature = "bench")]
pub mod bench {
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::mock::LayerDescriptor;
use pageserver::tenant::storage_layer::{Layer, LayerFileName};
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Instant;
use utils::lsn::Lsn;
use utils::lsn::Lsn;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::<LayerDescriptor>::default();
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
let mut layer_map = LayerMap::<LayerDescriptor>::default();
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
let mut min_lsn = Lsn(u64::MAX);
let mut max_lsn = Lsn(0);
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
let mut updates = layer_map.batch_update();
for fname in filenames {
let fname = fname.unwrap();
let fname = LayerFileName::from_str(&fname).unwrap();
let layer = LayerDescriptor::from(fname);
let mut updates = layer_map.batch_update();
for fname in filenames {
let fname = fname.unwrap();
let fname = LayerFileName::from_str(&fname).unwrap();
let layer = LayerDescriptor::from(fname);
let lsn_range = layer.get_lsn_range();
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
let lsn_range = layer.get_lsn_range();
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(Arc::new(layer));
}
println!("min: {min_lsn}, max: {max_lsn}");
updates.flush();
layer_map
}
/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
// approximately equal sizes and cover approximately equal WAL since
// last image.
layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
let lr = l.get_lsn_range();
let key_inside = kr.start.next();
let lsn_before = Lsn(lr.start.0 - 1);
Some((key_inside, lsn_before))
}
})
.collect()
}
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
let mut parts = Vec::new();
// We add a partition boundary at the start of each image layer,
// no matter what lsn range it covers. This is just the easiest
// thing to do. A better thing to do would be to get a real
// partitioning from some database. Even better, remove the need
// for key partitions by deciding where to create image layers
// directly based on a coverage-based difficulty map.
let mut keys: Vec<_> = layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
Some(kr.start.next())
}
})
.collect();
keys.sort();
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
for key in keys {
parts.push(KeySpace {
ranges: vec![current_key..key],
});
current_key = key;
}
KeyPartitioning { parts }
}
// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many timmes. The pgbench database was initialized
// between each test run.
fn bench_from_captest_env(c: &mut Criterion) {
// TODO consider compressing this file
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Test with uniform query pattern
c.bench_function("captest_uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
});
});
// test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
c.bench_function("captest_rel_dir_query", |b| {
b.iter(|| {
let result = black_box(layer_map.search(
Key::from_hex("000000067F00008000000000000000000001").unwrap(),
// This LSN is higher than any of the LSNs in the tree
Lsn::from_str("D0/80208AE1").unwrap(),
));
result.unwrap();
});
});
}
// Benchmark using metadata extracted from a real project that was taknig
// too long processing layer map queries.
fn bench_from_real_project(c: &mut Criterion) {
// Init layer map
let now = Instant::now();
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
println!("Finished layer map init in {:?}", now.elapsed());
// Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Choose inputs for get_difficulty_map
let latest_lsn = layer_map
.iter_historic_layers()
.map(|l| l.get_lsn_range().end)
.max()
.unwrap();
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
// Check correctness of get_difficulty_map
// TODO put this in a dedicated test outside of this mod
{
println!("running correctness check");
let now = Instant::now();
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
assert!(result_bruteforce.len() == partitioning.parts.len());
println!("Finished bruteforce in {:?}", now.elapsed());
let now = Instant::now();
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
assert!(result_fast.len() == partitioning.parts.len());
println!("Finished fast in {:?}", now.elapsed());
// Assert results are equal. Manually iterate for easier debugging.
let zip = std::iter::zip(
&partitioning.parts,
std::iter::zip(result_bruteforce, result_fast),
);
for (_part, (bruteforce, fast)) in zip {
assert_eq!(bruteforce, fast);
updates.insert_historic(Arc::new(layer));
}
println!("No issues found");
println!("min: {min_lsn}, max: {max_lsn}");
updates.flush();
layer_map
}
// Define and name the benchmark function
let mut group = c.benchmark_group("real_map");
group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
});
});
group.bench_function("get_difficulty_map", |b| {
b.iter(|| {
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
});
});
group.finish();
}
/// Construct a layer map query pattern for benchmarks
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
// For each image layer we query one of the pages contained, at LSN right
// before the image layer was created. This gives us a somewhat uniform
// coverage of both the lsn and key space because image layers have
// approximately equal sizes and cover approximately equal WAL since
// last image.
layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
let lr = l.get_lsn_range();
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_sequential(c: &mut Criterion) {
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
//
// TODO This code is pretty slow and runs even if we're only running other
// benchmarks. It needs to be somewhere else, but it's not clear where.
// Putting it inside the `bench_function` closure is not a solution
// because then it runs multiple times during warmup.
let now = Instant::now();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = LayerDescriptor {
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(i)..Lsn(i + 1),
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(Arc::new(layer));
let key_inside = kr.start.next();
let lsn_before = Lsn(lr.start.0 - 1);
Some((key_inside, lsn_before))
}
})
.collect()
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());
// Choose 100 uniformly random queries
let rng = &mut StdRng::seed_from_u64(1);
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
.choose_multiple(rng, 100)
.copied()
.collect();
// Construct a partitioning for testing get_difficulty map when we
// don't have an exact result of `collect_keyspace` to work with.
fn uniform_key_partitioning(
layer_map: &LayerMap<LayerDescriptor>,
_lsn: Lsn,
) -> KeyPartitioning {
let mut parts = Vec::new();
// Define and name the benchmark function
let mut group = c.benchmark_group("sequential");
group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
// We add a partition boundary at the start of each image layer,
// no matter what lsn range it covers. This is just the easiest
// thing to do. A better thing to do would be to get a real
// partitioning from some database. Even better, remove the need
// for key partitions by deciding where to create image layers
// directly based on a coverage-based difficulty map.
let mut keys: Vec<_> = layer_map
.iter_historic_layers()
.filter_map(|l| {
if l.is_incremental() {
None
} else {
let kr = l.get_key_range();
Some(kr.start.next())
}
})
.collect();
keys.sort();
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
for key in keys {
parts.push(KeySpace {
ranges: vec![current_key..key],
});
current_key = key;
}
KeyPartitioning { parts }
}
// Benchmark using metadata extracted from our performance test environment, from
// a project where we have run pgbench many timmes. The pgbench database was initialized
// between each test run.
fn bench_from_captest_env(c: &mut Criterion) {
// TODO consider compressing this file
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Test with uniform query pattern
c.bench_function("captest_uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
});
});
});
group.finish();
// test with a key that corresponds to the RelDir entry. See pgdatadir_mapping.rs.
c.bench_function("captest_rel_dir_query", |b| {
b.iter(|| {
let result = black_box(layer_map.search(
Key::from_hex("000000067F00008000000000000000000001").unwrap(),
// This LSN is higher than any of the LSNs in the tree
Lsn::from_str("D0/80208AE1").unwrap(),
));
result.unwrap();
});
});
}
// Benchmark using metadata extracted from a real project that was taknig
// too long processing layer map queries.
fn bench_from_real_project(c: &mut Criterion) {
// Init layer map
let now = Instant::now();
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
println!("Finished layer map init in {:?}", now.elapsed());
// Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Choose inputs for get_difficulty_map
let latest_lsn = layer_map
.iter_historic_layers()
.map(|l| l.get_lsn_range().end)
.max()
.unwrap();
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
// Check correctness of get_difficulty_map
// TODO put this in a dedicated test outside of this mod
{
println!("running correctness check");
let now = Instant::now();
let result_bruteforce =
layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
assert!(result_bruteforce.len() == partitioning.parts.len());
println!("Finished bruteforce in {:?}", now.elapsed());
let now = Instant::now();
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
assert!(result_fast.len() == partitioning.parts.len());
println!("Finished fast in {:?}", now.elapsed());
// Assert results are equal. Manually iterate for easier debugging.
let zip = std::iter::zip(
&partitioning.parts,
std::iter::zip(result_bruteforce, result_fast),
);
for (_part, (bruteforce, fast)) in zip {
assert_eq!(bruteforce, fast);
}
println!("No issues found");
}
// Define and name the benchmark function
let mut group = c.benchmark_group("real_map");
group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
});
});
group.bench_function("get_difficulty_map", |b| {
b.iter(|| {
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
});
});
group.finish();
}
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_sequential(c: &mut Criterion) {
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
//
// TODO This code is pretty slow and runs even if we're only running other
// benchmarks. It needs to be somewhere else, but it's not clear where.
// Putting it inside the `bench_function` closure is not a solution
// because then it runs multiple times during warmup.
let now = Instant::now();
let mut layer_map = LayerMap::default();
let mut updates = layer_map.batch_update();
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = LayerDescriptor {
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(i)..Lsn(i + 1),
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(Arc::new(layer));
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());
// Choose 100 uniformly random queries
let rng = &mut StdRng::seed_from_u64(1);
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
.choose_multiple(rng, 100)
.copied()
.collect();
// Define and name the benchmark function
let mut group = c.benchmark_group("sequential");
group.bench_function("uniform_queries", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
black_box(layer_map.search(q.0, q.1));
}
});
});
group.finish();
}
criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
}
criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
criterion_main!(group_1, group_2, group_3);
#[cfg(feature = "bench")]
use criterion::criterion_main;
#[cfg(feature = "bench")]
criterion_main!(bench::group_1, bench::group_2, bench::group_3);
#[cfg(not(feature = "bench"))]
fn main() {
panic!("Use `--features bench` to run benchmarks")
}

View File

@@ -16,4 +16,3 @@ postgres_ffi.workspace = true
utils.workspace = true
svg_fmt.workspace = true
workspace_hack.workspace = true
itertools.workspace = true

View File

@@ -41,18 +41,9 @@ pub(crate) enum LayerCmd {
/// The id from list-layer command
id: usize,
},
/// Output layer statistics
GetStats {
path: PathBuf,
tenant: String,
timeline: String,
/// The id from list-layer command
id: usize,
},
}
// Return (key, value.len) for all keys, sorted by key.
fn read_delta_file(path: impl AsRef<Path>) -> Result<Vec<(Key, usize)>> {
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
use pageserver::tenant::blob_io::BlobCursor;
use pageserver::tenant::block_io::BlockReader;
@@ -79,48 +70,11 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<Vec<(Key, usize)>> {
},
)?;
let mut cursor = BlockCursor::new(&file);
let mut result = vec![];
for (k, v) in all {
let value = cursor.read_blob(v.pos())?;
result.push((k, value.len()));
println!("key:{} value_len:{}", k, value.len());
}
// TODO(chi): special handling for last key?
Ok(result)
}
// We divide the entire i128 keyspace into pre-assigned fixed segments,
// 8MB each. Group keys by segment, and report segment size for each.
//
// 8MB is chosen as the segment size because we're unlikely to make
// s3 partial downloads smaller than 8MB (due to cost). So summarizing
// layer metadata in 8MB segments could be enough to generate good test
// data for write amplification tests.
//
// Note that the segments are fixed, and don't depend on what keyspace
// is actually in use.
fn read_delta_segments(path: impl AsRef<Path>) -> Result<Vec<(i128, usize)>> {
fn key_to_segment(key: &Key) -> i128 {
// A page is 8KB. So 1024 pages are 8MB.
key.to_i128() >> 10
}
use itertools::Itertools;
let delta_metadata = read_delta_file(path)?;
let group_iter = delta_metadata.iter().group_by(|(k, _)| key_to_segment(k));
let group_sizes = group_iter.into_iter().map(|(segment, lengths_group)| {
let sum: usize = lengths_group.map(|(_k, len)| len).sum();
(segment, sum)
});
Ok(group_sizes.collect())
}
fn summarize_delta_file(path: impl AsRef<Path>) -> Result<()> {
// TODO write in some compressed binary format
for (segment, size) in read_delta_segments(path)? {
println!("segment:{} size:{}", segment, size);
}
Ok(())
}
@@ -199,38 +153,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
);
if layer_file.is_delta {
for (k, len) in read_delta_file(layer.path())? {
println!("key:{} value_len:{}", k, len);
}
} else {
anyhow::bail!("not supported yet :(");
}
break;
}
idx += 1;
}
}
}
LayerCmd::GetStats {
path,
tenant,
timeline,
id,
} => {
let timeline_path = path
.join("tenants")
.join(tenant)
.join("timelines")
.join(timeline);
let mut idx = 0;
for layer in fs::read_dir(timeline_path)? {
let layer = layer?;
if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
{
if *id == idx {
if layer_file.is_delta {
summarize_delta_file(layer.path())?;
read_delta_file(layer.path())?;
} else {
anyhow::bail!("not supported yet :(");
}

View File

@@ -1,6 +1,3 @@
//!
//! Management HTTP API
//!
use std::collections::HashMap;
use std::sync::Arc;
@@ -49,6 +46,7 @@ use utils::{
};
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::ConfigureFailpointsRequest;
struct State {
@@ -292,19 +290,13 @@ fn build_timeline_info_common(
}
// healthcheck handler
async fn status_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let config = get_config(&request);
json_response(StatusCode::OK, StatusResponse { id: config.id })
}
async fn timeline_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_id))?;
@@ -340,10 +332,7 @@ async fn timeline_create_handler(
.await
}
async fn timeline_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
@@ -377,10 +366,7 @@ async fn timeline_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size: Option<bool> =
@@ -414,10 +400,7 @@ async fn timeline_detail_handler(
json_response(StatusCode::OK, timeline_info)
}
async fn get_lsn_by_timestamp_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -441,10 +424,7 @@ async fn get_lsn_by_timestamp_handler(
json_response(StatusCode::OK, result)
}
async fn tenant_attach_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_attach_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -480,10 +460,7 @@ async fn tenant_attach_handler(
json_response(StatusCode::ACCEPTED, ())
}
async fn timeline_delete_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -497,10 +474,7 @@ async fn timeline_delete_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_detach_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
@@ -514,10 +488,7 @@ async fn tenant_detach_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -537,10 +508,7 @@ async fn tenant_load_handler(
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_ignore_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -553,10 +521,7 @@ async fn tenant_ignore_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let response_data = mgr::list_tenants()
@@ -576,10 +541,7 @@ async fn tenant_list_handler(
json_response(StatusCode::OK, response_data)
}
async fn tenant_status(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -619,10 +581,7 @@ async fn tenant_status(
/// Note: we don't update the cached size and prometheus metric here.
/// The retention period might be different, and it's nice to have a method to just calculate it
/// without modifying anything anyway.
async fn tenant_size_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
@@ -687,10 +646,7 @@ async fn tenant_size_handler(
)
}
async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn layer_map_info_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let reset: LayerAccessStatsReset =
@@ -704,10 +660,7 @@ async fn layer_map_info_handler(
json_response(StatusCode::OK, layer_map_info)
}
async fn layer_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn layer_download_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -730,10 +683,7 @@ async fn layer_download_handler(
}
}
async fn evict_timeline_layer_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -811,10 +761,7 @@ pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>,
Ok(response)
}
async fn tenant_create_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let request_data: TenantCreateRequest = json_request(&mut request).await?;
let target_tenant_id = request_data.new_tenant_id;
check_permission(&request, None)?;
@@ -861,10 +808,7 @@ async fn tenant_create_handler(
)
}
async fn get_tenant_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -890,7 +834,6 @@ async fn get_tenant_config_handler(
async fn update_tenant_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
@@ -908,10 +851,8 @@ async fn update_tenant_config_handler(
}
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
@@ -923,10 +864,8 @@ async fn handle_tenant_break(
json_response(StatusCode::OK, ())
}
async fn failpoints_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
"Cannot manage failpoints because pageserver was compiled without failpoints support"
@@ -959,10 +898,7 @@ async fn failpoints_handler(
}
// Run GC immediately on given timeline.
async fn timeline_gc_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -981,10 +917,8 @@ async fn timeline_gc_handler(
}
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1005,10 +939,8 @@ async fn timeline_compact_handler(
}
// Run checkpoint immediately on given timeline.
async fn timeline_checkpoint_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1032,7 +964,6 @@ async fn timeline_checkpoint_handler(
async fn timeline_download_remote_layers_handler_post(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1048,7 +979,6 @@ async fn timeline_download_remote_layers_handler_post(
async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
@@ -1072,10 +1002,7 @@ async fn active_timeline_of_active_tenant(
.map_err(ApiError::NotFound)
}
async fn always_panic_handler(
req: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
// Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
// For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
// Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
@@ -1086,10 +1013,7 @@ async fn always_panic_handler(
json_response(StatusCode::NO_CONTENT, ())
}
async fn disk_usage_eviction_run(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
@@ -1179,10 +1103,8 @@ async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
)
}
async fn post_tracing_event_handler(
mut r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
#[cfg(feature = "testing")]
async fn post_tracing_event_handler(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
#[derive(Debug, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
enum Level {
@@ -1212,85 +1134,6 @@ async fn post_tracing_event_handler(
json_response(StatusCode::OK, ())
}
/// Common functionality of all the HTTP API handlers.
///
/// - Adds a tracing span to each request (by `request_span`)
/// - Logs the request depending on the request method (by `request_span`)
/// - Logs the response if it was not successful (by `request_span`
/// - Shields the handler function from async cancellations. Hyper can drop the handler
/// Future if the connection to the client is lost, but most of the pageserver code is
/// not async cancellation safe. This converts the dropped future into a graceful cancellation
/// request with a CancellationToken.
async fn api_handler<R, H>(request: Request<Body>, handler: H) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
{
// Spawn a new task to handle the request, to protect the handler from unexpected
// async cancellations. Most pageserver functions are not async cancellation safe.
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
// with the cancellation token.
let token = CancellationToken::new();
let cancel_guard = token.clone().drop_guard();
let result = request_span(request, move |r| async {
let handle = tokio::spawn(
async {
let token_cloned = token.clone();
let result = handler(r, token).await;
if token_cloned.is_cancelled() {
info!("Cancelled request finished");
}
result
}
.in_current_span(),
);
match handle.await {
Ok(result) => result,
Err(e) => {
// The handler task panicked. We have a global panic handler that logs the
// panic with its backtrace, so no need to log that here. Only log a brief
// message to make it clear that we returned the error to the client.
error!("HTTP request handler task panicked: {e:#}");
// Don't return an Error here, because then fallback error handler that was
// installed in make_router() will print the error. Instead, construct the
// HTTP error response and return that.
Ok(
ApiError::InternalServerError(anyhow!("HTTP request handler task panicked"))
.into_response(),
)
}
}
})
.await;
cancel_guard.disarm();
result
}
/// Like api_handler, but returns an error response if the server is built without
/// the 'testing' feature.
async fn testing_api_handler<R, H>(
desc: &str,
request: Request<Body>,
handler: H,
) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
{
if cfg!(feature = "testing") {
api_handler(request, handler).await
} else {
std::future::ready(Err(ApiError::BadRequest(anyhow!(
"Cannot {desc} because pageserver was compiled without testing APIs",
))))
.await
}
}
pub fn make_router(
conf: &'static PageServerConf,
launch_ts: &'static LaunchTimestamp,
@@ -1320,6 +1163,26 @@ pub fn make_router(
.expect("construct launch timestamp header middleware"),
);
macro_rules! testing_api {
($handler_desc:literal, $handler:path $(,)?) => {{
#[cfg(not(feature = "testing"))]
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was compiled without testing APIs",
))))
}
#[cfg(feature = "testing")]
let handler = $handler;
#[cfg(not(feature = "testing"))]
let handler = cfg_disabled;
move |r| request_span(r, handler)
}};
}
Ok(router
.data(Arc::new(
State::new(
@@ -1331,88 +1194,92 @@ pub fn make_router(
)
.context("Failed to initialize router state")?,
))
.get("/v1/status", |r| api_handler(r, status_handler))
.put("/v1/failpoints", |r| {
testing_api_handler("manage failpoints", r, failpoints_handler)
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
.get("/v1/status", |r| request_span(r, status_handler))
.put(
"/v1/failpoints",
testing_api!("manage failpoints", failpoints_handler),
)
.get("/v1/tenant", |r| request_span(r, tenant_list_handler))
.post("/v1/tenant", |r| request_span(r, tenant_create_handler))
.get("/v1/tenant/:tenant_id", |r| request_span(r, tenant_status))
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
request_span(r, tenant_size_handler)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
request_span(r, update_tenant_config_handler)
})
.get("/v1/tenant/:tenant_id/config", |r| {
api_handler(r, get_tenant_config_handler)
request_span(r, get_tenant_config_handler)
})
.get("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_list_handler)
request_span(r, timeline_list_handler)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_create_handler)
request_span(r, timeline_create_handler)
})
.post("/v1/tenant/:tenant_id/attach", |r| {
api_handler(r, tenant_attach_handler)
request_span(r, tenant_attach_handler)
})
.post("/v1/tenant/:tenant_id/detach", |r| {
api_handler(r, tenant_detach_handler)
request_span(r, tenant_detach_handler)
})
.post("/v1/tenant/:tenant_id/load", |r| {
api_handler(r, tenant_load_handler)
request_span(r, tenant_load_handler)
})
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
request_span(r, tenant_ignore_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
request_span(r, timeline_detail_handler)
})
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|r| api_handler(r, get_lsn_by_timestamp_handler),
|r| request_span(r, get_lsn_by_timestamp_handler),
)
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
api_handler(r, timeline_gc_handler)
})
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
request_span(r, timeline_gc_handler)
})
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
testing_api!("run timeline compaction", timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|r| request_span(r, timeline_download_remote_layers_handler_post),
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|r| request_span(r, timeline_download_remote_layers_handler_get),
)
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
request_span(r, timeline_delete_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
api_handler(r, layer_map_info_handler)
request_span(r, layer_map_info_handler)
})
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, layer_download_handler),
|r| request_span(r, layer_download_handler),
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
|r| request_span(r, evict_timeline_layer_handler),
)
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
.post("/v1/tracing/event", |r| {
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
request_span(r, disk_usage_eviction_run)
})
.put(
"/v1/tenant/:tenant_id/break",
testing_api!("set tenant state to broken", handle_tenant_break),
)
.get("/v1/panic", |r| request_span(r, always_panic_handler))
.post(
"/v1/tracing/event",
testing_api!("emit a tracing event", post_tracing_event_handler),
)
.any(handler_404))
}

View File

@@ -84,16 +84,6 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
&["tenant_id", "timeline_id"],
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
// Metrics collected on operations on the storage repository.
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
@@ -105,25 +95,6 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_getpage_get_reconstruct_data_seconds",
"Time spent in get_reconstruct_value_data",
&["tenant_id", "timeline_id"],
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_materialized_cache_hits_total",
@@ -383,7 +354,6 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
0.001000, // 1000 usec
0.030, // 30 ms
1.000, // 1000 ms
30.000, // 30000 ms
];
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -652,7 +622,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
"Time spent waiting for access to the WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
@@ -661,7 +631,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
"Histogram of number of records replayed per redo",
redo_histogram_count_buckets!(),
)
.expect("failed to define a metric")
@@ -670,7 +640,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo sent to Postgres",
"Histogram of number of records replayed per redo",
redo_bytes_histogram_count_buckets!(),
)
.expect("failed to define a metric")
@@ -753,9 +723,7 @@ pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub reconstruct_time_histo: Histogram,
pub get_reconstruct_data_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -766,7 +734,6 @@ pub struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub resident_physical_size_gauge: UIntGauge,
pub read_num_fs_layers: Histogram,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: UIntGauge,
pub num_persistent_files_created: IntCounter,
@@ -786,9 +753,6 @@ impl TimelineMetrics {
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
@@ -830,12 +794,6 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
@@ -843,9 +801,7 @@ impl TimelineMetrics {
tenant_id,
timeline_id,
reconstruct_time_histo,
get_reconstruct_data_time_histo,
materialized_page_cache_hit_counter,
materialized_page_cache_hit_upon_request_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -863,7 +819,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration: std::sync::RwLock::new(
evictions_with_low_residence_duration,
),
read_num_fs_layers,
}
}
}
@@ -873,9 +828,7 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
@@ -883,8 +836,6 @@ impl Drop for TimelineMetrics {
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
.unwrap()

View File

@@ -762,7 +762,8 @@ where
#[cfg(test)]
mod tests {
use super::{LayerMap, Replacement};
use crate::tenant::storage_layer::{Layer, LayerDescriptor, LayerFileName};
use crate::tenant::storage_layer::mock::LayerDescriptor;
use crate::tenant::storage_layer::{Layer, LayerFileName};
use std::str::FromStr;
use std::sync::Arc;

View File

@@ -1,8 +1,8 @@
use std::ops::Range;
// NOTE the `im` crate has 20x more downloads and also has
// persistent/immutable BTree. But it's bugged so rpds is a
// better choice https://github.com/neondatabase/neon/issues/3395
// TODO the `im` crate has 20x more downloads and also has
// persistent/immutable BTree. It also runs a bit faster but
// results are not the same on some tests.
use rpds::RedBlackTreeMapSync;
/// Data structure that can efficiently:

View File

@@ -779,6 +779,7 @@ pub async fn immediate_gc(
Ok(wait_task_done)
}
#[cfg(feature = "testing")]
pub async fn immediate_compact(
tenant_id: TenantId,
timeline_id: TimelineId,

View File

@@ -4,7 +4,6 @@ pub mod delta_layer;
mod filename;
mod image_layer;
mod inmemory_layer;
mod layer_desc;
mod remote_layer;
use crate::config::PageServerConf;
@@ -38,7 +37,6 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::PersistentLayerDesc;
pub use remote_layer::RemoteLayer;
use super::layer_map::BatchedUpdates;
@@ -408,23 +406,14 @@ pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
/// An image layer is a snapshot of all the data in a key-range, at a single
/// LSN.
pub trait PersistentLayer: Layer {
/// Get the layer descriptor.
fn layer_desc(&self) -> &PersistentLayerDesc;
fn get_tenant_id(&self) -> TenantId {
self.layer_desc().tenant_id
}
fn get_tenant_id(&self) -> TenantId;
/// Identify the timeline this layer belongs to
fn get_timeline_id(&self) -> TimelineId {
self.layer_desc().timeline_id
}
fn get_timeline_id(&self) -> TimelineId;
/// File name used for this layer, both in the pageserver's local filesystem
/// state as well as in the remote storage.
fn filename(&self) -> LayerFileName {
self.layer_desc().filename()
}
fn filename(&self) -> LayerFileName;
// Path to the layer file in the local filesystem.
// `None` for `RemoteLayer`.
@@ -471,80 +460,86 @@ pub fn downcast_remote_layer(
}
}
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
pub key: Range<Key>,
pub lsn: Range<Lsn>,
pub is_incremental: bool,
pub short_id: String,
}
// Hiding this code under a compilation flag allows us to lint it differently than prod code
#[cfg(any(test, feature = "bench"))]
pub mod mock {
use super::*;
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
/// Holds metadata about a layer without any content. Used mostly for testing.
///
/// To use filenames as fixtures, parse them as [`LayerFileName`] then convert from that to a
/// LayerDescriptor.
#[derive(Clone, Debug)]
pub struct LayerDescriptor {
pub key: Range<Key>,
pub lsn: Range<Lsn>,
pub is_incremental: bool,
pub short_id: String,
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn.clone()
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn.clone()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
fn short_id(&self) -> String {
self.short_id.clone()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
_lsn_range: Range<Lsn>,
_reconstruct_data: &mut ValueReconstructState,
_ctx: &RequestContext,
) -> Result<ValueReconstructResult> {
todo!("This method shouldn't be part of the Layer trait")
}
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
}
fn short_id(&self) -> String {
self.short_id.clone()
}
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
let short_id = value.to_string();
LayerDescriptor {
key: value.key_range,
lsn: value.lsn_range,
is_incremental: true,
short_id,
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
todo!()
}
}
}
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
let short_id = value.to_string();
let lsn = value.lsn_as_range();
LayerDescriptor {
key: value.key_range,
lsn,
is_incremental: false,
short_id,
impl From<DeltaFileName> for LayerDescriptor {
fn from(value: DeltaFileName) -> Self {
let short_id = value.to_string();
LayerDescriptor {
key: value.key_range,
lsn: value.lsn_range,
is_incremental: true,
short_id,
}
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
impl From<ImageFileName> for LayerDescriptor {
fn from(value: ImageFileName) -> Self {
let short_id = value.to_string();
let lsn = value.lsn_as_range();
LayerDescriptor {
key: value.key_range,
lsn,
is_incremental: false,
short_id,
}
}
}
impl From<LayerFileName> for LayerDescriptor {
fn from(value: LayerFileName) -> Self {
match value {
LayerFileName::Delta(d) => Self::from(d),
LayerFileName::Image(i) => Self::from(i),
}
}
}
}

View File

@@ -56,8 +56,8 @@ use utils::{
};
use super::{
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
PathOrConf, PersistentLayerDesc,
DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter,
LayerKeyIter, PathOrConf,
};
///
@@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
lsn_range: layer.desc.lsn_range.clone(),
tenant_id: layer.tenant_id,
timeline_id: layer.timeline_id,
key_range: layer.key_range.clone(),
lsn_range: layer.lsn_range.clone(),
index_start_blk: 0,
index_root_blk: 0,
@@ -180,7 +180,10 @@ impl DeltaKey {
pub struct DeltaLayer {
path_or_conf: PathOrConf,
pub desc: PersistentLayerDesc,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub file_size: u64,
@@ -194,8 +197,8 @@ impl std::fmt::Debug for DeltaLayer {
use super::RangeDisplayDebug;
f.debug_struct("DeltaLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("lsn_range", &self.desc.lsn_range)
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("lsn_range", &self.lsn_range)
.field("file_size", &self.file_size)
.field("inner", &self.inner)
.finish()
@@ -225,16 +228,30 @@ impl std::fmt::Debug for DeltaLayerInner {
}
impl Layer for DeltaLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn is_incremental(&self) -> bool {
true
}
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
if !verbose {
@@ -307,10 +324,10 @@ impl Layer for DeltaLayer {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.desc.lsn_range.start);
ensure!(lsn_range.start >= self.lsn_range.start);
let mut need_image = true;
ensure!(self.desc.key_range.contains(&key));
ensure!(self.key_range.contains(&key));
{
// Open the file and lock the metadata in memory
@@ -385,31 +402,19 @@ impl Layer for DeltaLayer {
Ok(ValueReconstructResult::Complete)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
}
impl PersistentLayer for DeltaLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
@@ -597,12 +602,10 @@ impl DeltaLayer {
) -> DeltaLayer {
DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_delta(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn_range.clone(),
),
timeline_id,
tenant_id,
key_range: filename.key_range.clone(),
lsn_range: filename.lsn_range.clone(),
file_size,
access_stats,
inner: RwLock::new(DeltaLayerInner {
@@ -629,12 +632,10 @@ impl DeltaLayer {
Ok(DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_delta(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn_range,
),
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn_range: summary.lsn_range,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {
@@ -647,14 +648,18 @@ impl DeltaLayer {
}
fn layer_name(&self) -> DeltaFileName {
self.desc.delta_file_name()
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timeline_id,
self.tenant_id,
&self.layer_name(),
)
}
@@ -798,12 +803,10 @@ impl DeltaLayerWriterInner {
// set inner.file here. The first read will have to re-open it.
let layer = DeltaLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc: PersistentLayerDesc::new_delta(
self.tenant_id,
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
),
tenant_id: self.tenant_id,
timeline_id: self.timeline_id,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {

View File

@@ -9,8 +9,6 @@ use std::str::FromStr;
use utils::lsn::Lsn;
use super::PersistentLayerDesc;
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(PartialEq, Eq, Clone, Hash)]
pub struct DeltaFileName {
@@ -155,7 +153,7 @@ impl Ord for ImageFileName {
impl ImageFileName {
pub fn lsn_as_range(&self) -> Range<Lsn> {
// Saves from having to copypaste this all over
PersistentLayerDesc::image_layer_lsn_range(self.lsn)
self.lsn..(self.lsn + 1)
}
}

View File

@@ -52,8 +52,8 @@ use utils::{
lsn::Lsn,
};
use super::filename::ImageFileName;
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc};
use super::filename::{ImageFileName, LayerFileName};
use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf};
///
/// Header stored in the beginning of the file
@@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary {
Self {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: layer.desc.tenant_id,
timeline_id: layer.desc.timeline_id,
key_range: layer.desc.key_range.clone(),
tenant_id: layer.tenant_id,
timeline_id: layer.timeline_id,
key_range: layer.key_range.clone(),
lsn: layer.lsn,
index_start_blk: 0,
@@ -104,13 +104,14 @@ impl From<&ImageLayer> for Summary {
/// and it needs to be loaded before using it in queries.
pub struct ImageLayer {
path_or_conf: PathOrConf,
pub desc: PersistentLayerDesc,
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub file_size: u64,
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
access_stats: LayerAccessStats,
inner: RwLock<ImageLayerInner>,
@@ -121,7 +122,7 @@ impl std::fmt::Debug for ImageLayer {
use super::RangeDisplayDebug;
f.debug_struct("ImageLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("key_range", &RangeDisplayDebug(&self.key_range))
.field("file_size", &self.file_size)
.field("lsn", &self.lsn)
.field("inner", &self.inner)
@@ -152,15 +153,27 @@ impl std::fmt::Debug for ImageLayerInner {
}
impl Layer for ImageLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
// End-bound is exclusive
self.lsn..(self.lsn + 1)
}
fn is_incremental(&self) -> bool {
false
}
fn short_id(&self) -> String {
self.filename().file_name()
}
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn
self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn
);
if !verbose {
@@ -190,7 +203,7 @@ impl Layer for ImageLayer {
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
assert!(self.desc.key_range.contains(&key));
assert!(self.key_range.contains(&key));
assert!(lsn_range.start >= self.lsn);
assert!(lsn_range.end >= self.lsn);
@@ -217,37 +230,24 @@ impl Layer for ImageLayer {
Ok(ValueReconstructResult::Missing)
}
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
}
}
impl PersistentLayer for ImageLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn filename(&self) -> LayerFileName {
self.layer_name().into()
}
fn local_path(&self) -> Option<PathBuf> {
Some(self.path())
}
fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
unimplemented!();
}
@@ -405,13 +405,9 @@ impl ImageLayer {
) -> ImageLayer {
ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
desc: PersistentLayerDesc::new_img(
tenant_id,
timeline_id,
filename.key_range.clone(),
filename.lsn,
false,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
timeline_id,
tenant_id,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
file_size,
access_stats,
@@ -437,13 +433,9 @@ impl ImageLayer {
.context("get file metadata to determine size")?;
Ok(ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
desc: PersistentLayerDesc::new_img(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn,
false,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
timeline_id: summary.timeline_id,
tenant_id: summary.tenant_id,
key_range: summary.key_range,
lsn: summary.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -457,15 +449,18 @@ impl ImageLayer {
}
fn layer_name(&self) -> ImageFileName {
self.desc.image_file_name()
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn,
}
}
/// Path to the layer file in pageserver workdir.
pub fn path(&self) -> PathBuf {
Self::path_for(
&self.path_or_conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timeline_id,
self.tenant_id,
&self.layer_name(),
)
}
@@ -489,7 +484,6 @@ struct ImageLayerWriterInner {
tenant_id: TenantId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
blob_writer: WriteBlobWriter<VirtualFile>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
@@ -505,7 +499,6 @@ impl ImageLayerWriterInner {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
@@ -540,7 +533,6 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
is_incremental,
};
Ok(writer)
@@ -578,14 +570,6 @@ impl ImageLayerWriterInner {
file.write_all(buf.as_ref())?;
}
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
);
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
@@ -609,7 +593,9 @@ impl ImageLayerWriterInner {
// set inner.file here. The first read will have to re-open it.
let layer = ImageLayer {
path_or_conf: PathOrConf::Conf(self.conf),
desc,
timeline_id: self.timeline_id,
tenant_id: self.tenant_id,
key_range: self.key_range.clone(),
lsn: self.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
@@ -681,7 +667,6 @@ impl ImageLayerWriter {
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
@@ -690,7 +675,6 @@ impl ImageLayerWriter {
tenant_id,
key_range,
lsn,
is_incremental,
)?),
})
}

View File

@@ -1,109 +0,0 @@
use std::ops::Range;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::repository::Key;
use super::{DeltaFileName, ImageFileName, LayerFileName};
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
/// For image layer, this is `[lsn, lsn+1)`.
pub lsn_range: Range<Lsn>,
/// Whether this is a delta layer.
pub is_delta: bool,
/// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should
/// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
/// incremental.
pub is_incremental: bool,
}
impl PersistentLayerDesc {
pub fn short_id(&self) -> String {
self.filename().file_name()
}
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range: Self::image_layer_lsn_range(lsn),
is_delta: false,
is_incremental,
}
}
pub fn new_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range,
is_delta: true,
is_incremental: true,
}
}
/// Get the LSN that the image layer covers.
pub fn image_layer_lsn(&self) -> Lsn {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
self.lsn_range.start
}
/// Get the LSN range corresponding to a single image layer LSN.
pub fn image_layer_lsn_range(lsn: Lsn) -> Range<Lsn> {
lsn..(lsn + 1)
}
/// Get a delta file name for this layer.
///
/// Panic: if this is not a delta layer.
pub fn delta_file_name(&self) -> DeltaFileName {
assert!(self.is_delta);
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
/// Get a delta file name for this layer.
///
/// Panic: if this is not an image layer, or the lsn range is invalid
pub fn image_file_name(&self) -> ImageFileName {
assert!(!self.is_delta);
assert!(self.lsn_range.start + 1 == self.lsn_range.end);
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
}
pub fn filename(&self) -> LayerFileName {
if self.is_delta {
self.delta_file_name().into()
} else {
self.image_file_name().into()
}
}
}

View File

@@ -18,10 +18,11 @@ use utils::{
lsn::Lsn,
};
use super::filename::{DeltaFileName, ImageFileName};
use super::filename::{DeltaFileName, ImageFileName, LayerFileName};
use super::image_layer::ImageLayer;
use super::{
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
LayerResidenceStatus, PersistentLayer,
};
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -33,10 +34,19 @@ use super::{
///
/// See: [`crate::context::RequestContext`] for authorization to download
pub struct RemoteLayer {
pub desc: PersistentLayerDesc,
tenantid: TenantId,
timelineid: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
pub file_name: LayerFileName,
pub layer_metadata: LayerFileMetadata,
is_delta: bool,
is_incremental: bool,
access_stats: LayerAccessStats,
pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
@@ -56,14 +66,22 @@ pub struct RemoteLayer {
impl std::fmt::Debug for RemoteLayer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RemoteLayer")
.field("file_name", &self.desc.filename())
.field("file_name", &self.file_name)
.field("layer_metadata", &self.layer_metadata)
.field("is_incremental", &self.desc.is_incremental)
.field("is_incremental", &self.is_incremental)
.finish()
}
}
impl Layer for RemoteLayer {
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn get_value_reconstruct_data(
&self,
_key: Key,
@@ -77,45 +95,53 @@ impl Layer for RemoteLayer {
);
}
fn is_incremental(&self) -> bool {
self.is_incremental
}
/// debugging function to print out the contents of the layer
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.tenantid,
self.timelineid,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
Ok(())
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_key_range(&self) -> Range<Key> {
self.layer_desc().key_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn get_lsn_range(&self) -> Range<Lsn> {
self.layer_desc().lsn_range.clone()
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn is_incremental(&self) -> bool {
self.layer_desc().is_incremental
}
/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
fn short_id(&self) -> String {
self.layer_desc().short_id()
self.filename().file_name()
}
}
impl PersistentLayer for RemoteLayer {
fn layer_desc(&self) -> &PersistentLayerDesc {
&self.desc
fn get_tenant_id(&self) -> TenantId {
self.tenantid
}
fn get_timeline_id(&self) -> TimelineId {
self.timelineid
}
fn filename(&self) -> LayerFileName {
if self.is_delta {
DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
.into()
} else {
ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
}
.into()
}
}
fn local_path(&self) -> Option<PathBuf> {
@@ -150,7 +176,7 @@ impl PersistentLayer for RemoteLayer {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
if self.desc.is_delta {
if self.is_delta {
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.layer_metadata.file_size(),
@@ -184,13 +210,13 @@ impl RemoteLayer {
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_img(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn,
false,
),
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn_as_range(),
is_delta: false,
is_incremental: false,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -206,12 +232,13 @@ impl RemoteLayer {
access_stats: LayerAccessStats,
) -> RemoteLayer {
RemoteLayer {
desc: PersistentLayerDesc::new_delta(
tenantid,
timelineid,
fname.key_range.clone(),
fname.lsn_range.clone(),
),
tenantid,
timelineid,
key_range: fname.key_range.clone(),
lsn_range: fname.lsn_range.clone(),
is_delta: true,
is_incremental: true,
file_name: fname.to_owned().into(),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
download_replacement_failure: std::sync::atomic::AtomicBool::default(),
@@ -229,12 +256,15 @@ impl RemoteLayer {
where
L: ?Sized + Layer,
{
if self.desc.is_delta {
let fname = self.desc.delta_file_name();
if self.is_delta {
let fname = DeltaFileName {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
};
Arc::new(DeltaLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timelineid,
self.tenantid,
&fname,
file_size,
self.access_stats.clone_for_residence_change(
@@ -243,11 +273,14 @@ impl RemoteLayer {
),
))
} else {
let fname = self.desc.image_file_name();
let fname = ImageFileName {
key_range: self.key_range.clone(),
lsn: self.lsn_range.start,
};
Arc::new(ImageLayer::new(
conf,
self.desc.timeline_id,
self.desc.tenant_id,
self.timelineid,
self.tenantid,
&fname,
file_size,
self.access_stats.clone_for_residence_change(

View File

@@ -525,12 +525,7 @@ impl Timeline {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
Ordering::Equal => {
self.metrics
.materialized_page_cache_hit_upon_request_counter
.inc();
return Ok(cached_img); // exact LSN match, return the image
}
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
Ordering::Greater => {
unreachable!("the returned lsn should never be after the requested lsn")
}
@@ -545,10 +540,8 @@ impl Timeline {
img: cached_page_img,
};
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
.await?;
timer.stop_and_record();
self.metrics
.reconstruct_time_histo
@@ -2268,9 +2261,6 @@ impl Timeline {
let mut timeline_owned;
let mut timeline = self;
let mut read_count =
scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
// For debugging purposes, collect the path of layers that we traversed
// through. It's included in the error message if we fail to find the key.
let mut traversal_path = Vec::<TraversalPathItem>::new();
@@ -2405,7 +2395,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((
result,
cont_lsn,
@@ -2432,7 +2421,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((
result,
cont_lsn,
@@ -2467,7 +2455,6 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
traversal_path.push((
result,
cont_lsn,
@@ -2533,7 +2520,7 @@ impl Timeline {
(DownloadBehavior::Error, false) => {
return Err(PageReconstructError::NeedsDownload(
TenantTimelineId::new(self.tenant_id, self.timeline_id),
remote_layer.filename(),
remote_layer.file_name.clone(),
))
}
}
@@ -3066,7 +3053,6 @@ impl Timeline {
self.tenant_id,
&img_range,
lsn,
false, // image layer always covers the full range
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
@@ -4127,7 +4113,7 @@ impl Timeline {
// Does retries + exponential back-off internally.
// When this fails, don't layer further retry attempts here.
let result = remote_client
.download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata)
.download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata)
.await;
if let Ok(size) = &result {

View File

@@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use utils::{project_git_version, sentry_init::init_sentry};
use tracing::{error, info, warn, Instrument};
use tracing::{error, info, warn};
project_git_version!(GIT_VERSION);
@@ -141,6 +141,7 @@ async fn task_main(
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
info!("accepted postgres client connection from {peer_addr}");
let session_id = uuid::Uuid::new_v4();
let tls_config = Arc::clone(&tls_config);
@@ -148,18 +149,18 @@ async fn task_main(
connections.spawn(
async move {
info!("spawned a task for {peer_addr}");
socket
.set_nodelay(true)
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, socket).await
handle_client(dest_suffix, tls_config, session_id, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(tracing::info_span!("handle_client", ?session_id))
}),
);
}
_ = cancellation_token.cancelled() => {
@@ -191,6 +192,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
let mut stream = PqStream::new(Stream::from_raw(raw_stream));
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use pq_proto::FeStartupPacket::*;
match msg {
@@ -213,19 +215,15 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
Ok(raw.upgrade(tls_config).await?)
}
unexpected => {
info!(
?unexpected,
"unexpected startup packet, rejecting connection"
);
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?
}
_ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?,
}
}
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
async fn handle_client(
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
session_id: uuid::Uuid,
stream: impl AsyncRead + AsyncWrite + Unpin,
) -> anyhow::Result<()> {
let tls_stream = ssl_handshake(stream, tls_config).await?;

View File

@@ -65,19 +65,12 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_getpage_reconstruct_seconds_bucket",
"pageserver_getpage_reconstruct_seconds_count",
"pageserver_getpage_reconstruct_seconds_sum",
"pageserver_getpage_get_reconstruct_data_seconds_bucket",
"pageserver_getpage_get_reconstruct_data_seconds_count",
"pageserver_getpage_get_reconstruct_data_seconds_sum",
"pageserver_io_operations_bytes_total",
"pageserver_io_operations_seconds_bucket",
"pageserver_io_operations_seconds_count",
"pageserver_io_operations_seconds_sum",
"pageserver_last_record_lsn",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_read_num_fs_layers_bucket",
"pageserver_read_num_fs_layers_count",
"pageserver_read_num_fs_layers_sum",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
"pageserver_smgr_query_seconds_sum",

View File

@@ -37,7 +37,6 @@ class PgSniRouter(PgProtocol):
destination: str,
tls_cert: Path,
tls_key: Path,
test_output_dir: Path,
):
# Must use a hostname rather than IP here, for SNI to work
host = "localhost"
@@ -50,7 +49,6 @@ class PgSniRouter(PgProtocol):
self.tls_cert = tls_cert
self.tls_key = tls_key
self._popen: Optional[subprocess.Popen[bytes]] = None
self.test_output_dir = test_output_dir
def start(self) -> "PgSniRouter":
assert self._popen is None
@@ -62,12 +60,8 @@ class PgSniRouter(PgProtocol):
*["--destination", self.destination],
]
router_log_path = self.test_output_dir / "pg_sni_router.log"
router_log = open(router_log_path, "w")
self._popen = subprocess.Popen(args, stderr=router_log)
self._popen = subprocess.Popen(args)
self._wait_until_ready()
log.info(f"pg_sni_router started, log file: {router_log_path}")
return self
@backoff.on_exception(backoff.expo, OSError, max_time=10)
@@ -127,7 +121,6 @@ def test_pg_sni_router(
destination="localtest.me",
tls_cert=test_output_dir / "router.crt",
tls_key=test_output_dir / "router.key",
test_output_dir=test_output_dir,
) as router:
router.start()

View File

@@ -437,22 +437,12 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
wait_until(50, 0.1, got_hangup_log_message)
# check that the timeline is still present
ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
# ok, disable the failpoint to let the deletion finish
# ok, retry without failpoint, it should succeed
ps_http.configure_failpoints((failpoint_name, "off"))
def first_request_finished():
message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
assert env.pageserver.log_contains(message)
wait_until(50, 0.1, first_request_finished)
# check that the timeline is gone
notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found"
env.pageserver.allowed_errors.append(".*" + notfound_message)
with pytest.raises(PageserverApiException, match=notfound_message) as exc:
ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
assert exc.value.status_code == 404
# this should succeed
ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
# the second call will try to transition the timeline into Stopping state, but it's already in that state
env.pageserver.allowed_errors.append(
f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
)