mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 23:20:40 +00:00
Compare commits
3 Commits
vlad/storc
...
exponentia
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4a4c1b06eb | ||
|
|
a59720b5f5 | ||
|
|
65c519f36c |
@@ -104,6 +104,9 @@ pub struct ConfigToml {
|
||||
pub image_compression: ImageCompressionAlgorithm,
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
#[serde(skip_serializing)]
|
||||
// TODO(https://github.com/neondatabase/neon/issues/8184): remove after this field is removed from all pageserver.toml's
|
||||
pub compact_level0_phase1_value_access: serde::de::IgnoredAny,
|
||||
pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode,
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
@@ -381,6 +384,7 @@ impl Default for ConfigToml {
|
||||
image_compression: (DEFAULT_IMAGE_COMPRESSION),
|
||||
ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
l0_flush: None,
|
||||
compact_level0_phase1_value_access: Default::default(),
|
||||
virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(),
|
||||
|
||||
io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Quantify a single walredo manager's throughput under N concurrent callers.
|
||||
//!
|
||||
//! The benchmark implementation ([`bench_impl`]) is parametrized by
|
||||
//! - `redo_work` => an async closure that takes a `PostgresRedoManager` and performs one redo
|
||||
//! - `redo_work` => [`Request::short_request`] or [`Request::medium_request`]
|
||||
//! - `n_redos` => number of times the benchmark shell execute the `redo_work`
|
||||
//! - `nclients` => number of clients (more on this shortly).
|
||||
//!
|
||||
@@ -10,7 +10,7 @@
|
||||
//! Each task executes the `redo_work` `n_redos/nclients` times.
|
||||
//!
|
||||
//! We exercise the following combinations:
|
||||
//! - `redo_work = ping / short / medium``
|
||||
//! - `redo_work = short / medium``
|
||||
//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
|
||||
//!
|
||||
//! We let `criterion` determine the `n_redos` using `iter_custom`.
|
||||
@@ -27,43 +27,33 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-09-18 on im4gn.2xlarge
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! ping/1 time: [21.789 µs 21.918 µs 22.078 µs]
|
||||
//! ping/2 time: [27.686 µs 27.812 µs 27.970 µs]
|
||||
//! ping/4 time: [35.468 µs 35.671 µs 35.926 µs]
|
||||
//! ping/8 time: [59.682 µs 59.987 µs 60.363 µs]
|
||||
//! ping/16 time: [101.79 µs 102.37 µs 103.08 µs]
|
||||
//! ping/32 time: [184.18 µs 185.15 µs 186.36 µs]
|
||||
//! ping/64 time: [349.86 µs 351.45 µs 353.47 µs]
|
||||
//! ping/128 time: [684.53 µs 687.98 µs 692.17 µs]
|
||||
//! short/1 time: [31.833 µs 32.126 µs 32.428 µs]
|
||||
//! short/2 time: [35.558 µs 35.756 µs 35.992 µs]
|
||||
//! short/4 time: [44.850 µs 45.138 µs 45.484 µs]
|
||||
//! short/8 time: [65.985 µs 66.379 µs 66.853 µs]
|
||||
//! short/16 time: [127.06 µs 127.90 µs 128.87 µs]
|
||||
//! short/32 time: [252.98 µs 254.70 µs 256.73 µs]
|
||||
//! short/64 time: [497.13 µs 499.86 µs 503.26 µs]
|
||||
//! short/128 time: [987.46 µs 993.45 µs 1.0004 ms]
|
||||
//! medium/1 time: [137.91 µs 138.55 µs 139.35 µs]
|
||||
//! medium/2 time: [192.00 µs 192.91 µs 194.07 µs]
|
||||
//! medium/4 time: [389.62 µs 391.55 µs 394.01 µs]
|
||||
//! medium/8 time: [776.80 µs 780.33 µs 784.77 µs]
|
||||
//! medium/16 time: [1.5323 ms 1.5383 ms 1.5459 ms]
|
||||
//! medium/32 time: [3.0120 ms 3.0226 ms 3.0350 ms]
|
||||
//! medium/64 time: [5.7405 ms 5.7787 ms 5.8166 ms]
|
||||
//! medium/128 time: [10.412 ms 10.574 ms 10.718 ms]
|
||||
//! short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
future::Future,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
@@ -71,59 +61,40 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
macro_rules! bench_group {
|
||||
($name:expr, $redo_work:expr) => {{
|
||||
let name: &str = $name;
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(name);
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
b.iter_custom(|iters| bench_impl($redo_work, iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}};
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
//
|
||||
// benchmark the protocol implementation
|
||||
//
|
||||
let pg_version = 14;
|
||||
bench_group!(
|
||||
"ping",
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let _: () = mgr.ping(pg_version).await.unwrap();
|
||||
})
|
||||
);
|
||||
//
|
||||
// benchmarks with actual record redo
|
||||
//
|
||||
let make_redo_work = |req: &'static Request| {
|
||||
Arc::new(move |mgr: Arc<PostgresRedoManager>| async move {
|
||||
let page = req.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
})
|
||||
};
|
||||
bench_group!("short", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::short_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
bench_group!("medium", {
|
||||
static REQUEST: Lazy<Request> = Lazy::new(Request::medium_input);
|
||||
make_redo_work(&REQUEST)
|
||||
});
|
||||
}
|
||||
criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl<F, Fut>(redo_work: Arc<F>, n_redos: u64, nclients: u64) -> Duration
|
||||
where
|
||||
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
|
||||
Fut: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
@@ -164,20 +135,17 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
async fn client<F, Fut>(
|
||||
async fn client(
|
||||
mgr: Arc<PostgresRedoManager>,
|
||||
start: Arc<Barrier>,
|
||||
redo_work: Arc<F>,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
) -> Duration
|
||||
where
|
||||
F: Fn(Arc<PostgresRedoManager>) -> Fut + Send + Sync + 'static,
|
||||
Fut: Future<Output = ()> + Send + 'static,
|
||||
{
|
||||
) -> Duration {
|
||||
start.wait().await;
|
||||
let start = Instant::now();
|
||||
for _ in 0..n_redos {
|
||||
redo_work(Arc::clone(&mgr)).await;
|
||||
let page = redo_work.execute(&mgr).await.unwrap();
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
// The real pageserver will rarely if ever do 2 walredos in a row without
|
||||
// yielding to the executor.
|
||||
tokio::task::yield_now().await;
|
||||
|
||||
@@ -432,7 +432,7 @@ impl Client {
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
self.request(Method::PUT, &uri, req)
|
||||
self.request(Method::POST, &uri, req)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
|
||||
@@ -324,6 +324,7 @@ impl PageServerConf {
|
||||
max_vectored_read_bytes,
|
||||
image_compression,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
compact_level0_phase1_value_access: _,
|
||||
l0_flush,
|
||||
virtual_file_direct_io,
|
||||
concurrent_tenant_warmup,
|
||||
@@ -534,6 +535,16 @@ mod tests {
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compactl0_phase1_access_mode_is_ignored_silently() {
|
||||
let input = indoc::indoc! {r#"
|
||||
[compact_level0_phase1_value_access]
|
||||
mode = "streaming-kmerge"
|
||||
validate = "key-lsn-value"
|
||||
"#};
|
||||
toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input).unwrap();
|
||||
}
|
||||
|
||||
/// If there's a typo in the pageserver config, we'd rather catch that typo
|
||||
/// and fail pageserver startup than silently ignoring the typo, leaving whoever
|
||||
/// made it in the believe that their config change is effective.
|
||||
|
||||
@@ -2955,7 +2955,7 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
)
|
||||
.put(
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|
||||
|r| api_handler(r, timeline_archival_config_handler),
|
||||
)
|
||||
|
||||
@@ -840,36 +840,6 @@ impl Timeline {
|
||||
Ok(total_size * BLCKSZ as u64)
|
||||
}
|
||||
|
||||
/// Get a KeySpace that covers all the Keys that are in use at AND below the given LSN. This is only used
|
||||
/// for gc-compaction.
|
||||
///
|
||||
/// gc-compaction cannot use the same `collect_keyspace` function as the legacy compaction because it
|
||||
/// processes data at multiple LSNs and needs to be aware of the fact that some key ranges might need to
|
||||
/// be kept only for a specific range of LSN.
|
||||
///
|
||||
/// Consider the case that the user created branches at LSN 10 and 20, where the user created a table A at
|
||||
/// LSN 10 and dropped that table at LSN 20. `collect_keyspace` at LSN 10 will return the key range
|
||||
/// corresponding to that table, while LSN 20 won't. The keyspace info at a single LSN is not enough to
|
||||
/// determine which keys to retain/drop for gc-compaction.
|
||||
///
|
||||
/// For now, it only drops AUX-v1 keys. But in the future, the function will be extended to return the keyspace
|
||||
/// to be retained for each of the branch LSN.
|
||||
///
|
||||
/// The return value is (dense keyspace, sparse keyspace).
|
||||
pub(crate) async fn collect_gc_compaction_keyspace(
|
||||
&self,
|
||||
) -> Result<(KeySpace, SparseKeySpace), CollectKeySpaceError> {
|
||||
let metadata_key_begin = Key::metadata_key_range().start;
|
||||
let aux_v1_key = AUX_FILES_KEY;
|
||||
let dense_keyspace = KeySpace {
|
||||
ranges: vec![Key::MIN..aux_v1_key, aux_v1_key.next()..metadata_key_begin],
|
||||
};
|
||||
Ok((
|
||||
dense_keyspace,
|
||||
SparseKeySpace(KeySpace::single(Key::metadata_key_range())),
|
||||
))
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed maybe removed from the underlying storage (from
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
//! Common traits and structs for layers
|
||||
|
||||
pub mod delta_layer;
|
||||
pub mod filter_iterator;
|
||||
pub mod image_layer;
|
||||
pub mod inmemory_layer;
|
||||
pub(crate) mod layer;
|
||||
mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
|
||||
@@ -1,205 +0,0 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
keyspace::{KeySpace, SparseKeySpace},
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::Value;
|
||||
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
|
||||
///
|
||||
/// The iterator will skip any keys not included in the keyspace filter. In other words, the keyspace filter contains the keys
|
||||
/// to be retained.
|
||||
pub struct FilterIterator<'a> {
|
||||
inner: MergeIterator<'a>,
|
||||
retain_key_filters: Vec<Range<Key>>,
|
||||
current_filter_idx: usize,
|
||||
}
|
||||
|
||||
impl<'a> FilterIterator<'a> {
|
||||
pub fn create(
|
||||
inner: MergeIterator<'a>,
|
||||
dense_keyspace: KeySpace,
|
||||
sparse_keyspace: SparseKeySpace,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut retain_key_filters = Vec::new();
|
||||
retain_key_filters.extend(dense_keyspace.ranges);
|
||||
retain_key_filters.extend(sparse_keyspace.0.ranges);
|
||||
retain_key_filters.sort_by(|a, b| a.start.cmp(&b.start));
|
||||
// Verify key filters are non-overlapping and sorted
|
||||
for window in retain_key_filters.windows(2) {
|
||||
if window[0].end > window[1].start {
|
||||
bail!(
|
||||
"Key filters are overlapping: {:?} and {:?}",
|
||||
window[0],
|
||||
window[1]
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
inner,
|
||||
retain_key_filters,
|
||||
current_filter_idx: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(item) = self.inner.next().await? {
|
||||
while self.current_filter_idx < self.retain_key_filters.len()
|
||||
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
|
||||
{
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
self.current_filter_idx += 1;
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
if self.current_filter_idx >= self.retain_key_filters.len() {
|
||||
// We already exhausted all filters, so we should return now
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter (nothing)
|
||||
return Ok(None);
|
||||
}
|
||||
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
return Ok(Some(item));
|
||||
}
|
||||
// If the key is not contained in the key retaining filters, continue to the next item.
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::produce_delta_layer,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
async fn assert_filter_iter_equal(
|
||||
filter_iter: &mut FilterIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
) {
|
||||
let mut expect_iter = expect.iter();
|
||||
loop {
|
||||
let o1 = filter_iter.next().await.unwrap();
|
||||
let o2 = expect_iter.next();
|
||||
assert_eq!(o1.is_some(), o2.is_some());
|
||||
if o1.is_none() && o2.is_none() {
|
||||
break;
|
||||
}
|
||||
let (k1, l1, v1) = o1.unwrap();
|
||||
let (k2, l2, v2) = o2.unwrap();
|
||||
assert_eq!(&k1, k2);
|
||||
assert_eq!(l1, *l2);
|
||||
assert_eq!(&v1, v2);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
const N: usize = 100;
|
||||
let test_deltas1 = (0..N)
|
||||
.map(|idx| {
|
||||
(
|
||||
get_key(idx as u32),
|
||||
Lsn(0x20 * ((idx as u64) % 10 + 1)),
|
||||
Value::Image(Bytes::from(format!("img{idx:05}"))),
|
||||
)
|
||||
})
|
||||
.collect_vec();
|
||||
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(5)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(110),
|
||||
get_key(1000)..get_key(2000),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[5..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..100].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
|
||||
let merge_iter = MergeIterator::create(
|
||||
&[resident_layer_1.get_as_delta(&ctx).await.unwrap()],
|
||||
&[],
|
||||
&ctx,
|
||||
);
|
||||
|
||||
let mut filter_iter = FilterIterator::create(
|
||||
merge_iter,
|
||||
KeySpace {
|
||||
ranges: vec![
|
||||
get_key(0)..get_key(10),
|
||||
get_key(20)..get_key(30),
|
||||
get_key(90)..get_key(95),
|
||||
],
|
||||
},
|
||||
SparseKeySpace(KeySpace::default()),
|
||||
)
|
||||
.unwrap();
|
||||
let mut result = Vec::new();
|
||||
result.extend(test_deltas1[0..10].iter().cloned());
|
||||
result.extend(test_deltas1[20..30].iter().cloned());
|
||||
result.extend(test_deltas1[90..95].iter().cloned());
|
||||
assert_filter_iter_equal(&mut filter_iter, &result).await;
|
||||
}
|
||||
}
|
||||
@@ -31,7 +31,6 @@ use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}
|
||||
use crate::page_cache;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::filter_iterator::FilterIterator;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
@@ -1773,7 +1772,6 @@ impl Timeline {
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
);
|
||||
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
@@ -1822,12 +1820,7 @@ impl Timeline {
|
||||
image_layers.push(layer);
|
||||
}
|
||||
}
|
||||
let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
|
||||
let mut merge_iter = FilterIterator::create(
|
||||
MergeIterator::create(&delta_layers, &image_layers, ctx),
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
|
||||
@@ -205,22 +205,6 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but by Rust benchmarks.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
|
||||
self.do_with_walredo_process(pg_version, |proc| async move {
|
||||
proc.ping(Duration::from_secs(1))
|
||||
.await
|
||||
.map_err(Error::Other)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
@@ -313,9 +297,6 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancel-safe iff `closure` is cancel-safe.
|
||||
async fn do_with_walredo_process<
|
||||
F: FnOnce(Arc<Process>) -> Fut,
|
||||
Fut: Future<Output = Result<O, Error>>,
|
||||
@@ -556,17 +537,6 @@ mod tests {
|
||||
use tracing::Instrument;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ping() {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
|
||||
h.manager
|
||||
.ping(14)
|
||||
.instrument(h.span())
|
||||
.await
|
||||
.expect("ping should work");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn short_v14_redo() {
|
||||
let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();
|
||||
|
||||
@@ -6,7 +6,6 @@ use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
page_cache::PAGE_SZ,
|
||||
span::debug_assert_current_span_has_tenant_id,
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
@@ -238,26 +237,6 @@ impl WalRedoProcess {
|
||||
res
|
||||
}
|
||||
|
||||
/// Do a ping request-response roundtrip.
|
||||
///
|
||||
/// Not used in production, but by Rust benchmarks.
|
||||
pub(crate) async fn ping(&self, timeout: Duration) -> anyhow::Result<()> {
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity(4);
|
||||
protocol::build_ping_msg(&mut writebuf);
|
||||
let Ok(res) = tokio::time::timeout(timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo ping timed out");
|
||||
};
|
||||
let response = res?;
|
||||
if response.len() != PAGE_SZ {
|
||||
anyhow::bail!(
|
||||
"WAL redo ping response should respond with page-sized response: {}",
|
||||
response.len()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
|
||||
@@ -55,8 +55,3 @@ pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
}
|
||||
|
||||
pub(crate) fn build_ping_msg(buf: &mut Vec<u8>) {
|
||||
buf.put_u8(b'H');
|
||||
buf.put_u32(4);
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -1263,7 +1263,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
|
||||
int32 dc;
|
||||
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, 1.0);
|
||||
LWLockRelease(lfc_lock);
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
@@ -1280,7 +1280,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
int32 dc;
|
||||
bool reset = PG_GETARG_BOOL(0);
|
||||
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1, 1.0);
|
||||
if (reset)
|
||||
memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
|
||||
LWLockRelease(lfc_lock);
|
||||
@@ -1288,3 +1288,21 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
}
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(approximate_optimal_cache_size);
|
||||
|
||||
Datum
|
||||
approximate_optimal_cache_size(PG_FUNCTION_ARGS)
|
||||
{
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
int32 dc;
|
||||
time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
|
||||
double min_hit_ratio = PG_ARGISNULL(1) ? 1.0 : PG_GETARG_FLOAT8(1);
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration, min_hit_ratio);
|
||||
LWLockRelease(lfc_lock);
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
|
||||
*
|
||||
* Implements https://hal.science/hal-00465313/document
|
||||
*
|
||||
*
|
||||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally
|
||||
* suited to estimating the cardinality of very large sets; in particular, we
|
||||
* have not attempted to further optimize the implementation as described in
|
||||
@@ -126,22 +126,69 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
|
||||
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
|
||||
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
|
||||
|
||||
cState->regs[index][count] = now;
|
||||
if (cState->regs[index][count].ts)
|
||||
{
|
||||
/* update histgoram */
|
||||
int64_t delta = (now - cState->regs[index][count].ts)/USECS_PER_SEC;
|
||||
uint32_t new_histogram[HIST_SIZE] = {0};
|
||||
for (int i = 0; i < HIST_SIZE; i++) {
|
||||
/* Use middle point of interval */
|
||||
uint32 interval_log2 = pg_ceil_log2_32((delta + (HIST_MIN_INTERVAL*((1<<i) + ((1<<i)/2))/2)) / HIST_MIN_INTERVAL);
|
||||
uint32 cell = Min(interval_log2, HIST_SIZE-1);
|
||||
new_histogram[cell] += cState->regs[index][count].histogram[i];
|
||||
}
|
||||
memcpy(cState->regs[index][count].histogram, new_histogram, sizeof new_histogram);
|
||||
}
|
||||
cState->regs[index][count].ts = now;
|
||||
cState->regs[index][count].histogram[0] += 1; // most recent access always goes to first histogram backet
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
getAccessCount(const HyperLogLogRegister* reg, time_t duration)
|
||||
{
|
||||
uint32_t count = 0;
|
||||
/* Simplest solution is to take in account all points fro overlapped interval */
|
||||
for (size_t i = 0; i < HIST_SIZE && HIST_MIN_INTERVAL*((1 << i)/2) <= duration; i++) {
|
||||
count += reg->histogram[i];
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
static uint8
|
||||
getMaximum(const TimestampTz* reg, TimestampTz since)
|
||||
getMaximum(const HyperLogLogRegister* reg, TimestampTz since, time_t duration, double min_hit_ratio)
|
||||
{
|
||||
uint8 max = 0;
|
||||
|
||||
for (size_t i = 0; i < HLL_C_BITS + 1; i++)
|
||||
size_t i, j;
|
||||
if (min_hit_ratio == 1.0)
|
||||
{
|
||||
if (reg[i] >= since)
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
max = i;
|
||||
if (reg[i].ts >= since)
|
||||
{
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t total_count = 0;
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
total_count += getAccessCount(®[i], duration);
|
||||
}
|
||||
if (total_count != 0)
|
||||
{
|
||||
const double threshold = total_count * (1 - min_hit_ratio);
|
||||
for (i = 0; i < HLL_C_BITS + 1; i++)
|
||||
{
|
||||
// Take in account only bits with access frequncy exceeding maximal miss rate (1 - hit rate)
|
||||
if (reg[i].ts >= since && getAccessCount(®[i], duration) >= threshold)
|
||||
{
|
||||
max = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return max;
|
||||
}
|
||||
|
||||
@@ -150,7 +197,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since)
|
||||
* Estimates cardinality, based on elements added so far
|
||||
*/
|
||||
double
|
||||
estimateSHLL(HyperLogLogState *cState, time_t duration)
|
||||
estimateSHLL(HyperLogLogState *cState, time_t duration, double min_hit_ratio)
|
||||
{
|
||||
double result;
|
||||
double sum = 0.0;
|
||||
@@ -161,7 +208,7 @@ estimateSHLL(HyperLogLogState *cState, time_t duration)
|
||||
|
||||
for (i = 0; i < HLL_N_REGISTERS; i++)
|
||||
{
|
||||
R[i] = getMaximum(cState->regs[i], since);
|
||||
R[i] = getMaximum(cState->regs[i], since, duration, min_hit_ratio);
|
||||
sum += 1.0 / pow(2.0, R[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -53,6 +53,14 @@
|
||||
#define HLL_C_BITS (32 - HLL_BIT_WIDTH)
|
||||
#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH)
|
||||
|
||||
/*
|
||||
* Number of histogram cells. We use exponential histogram with first interval
|
||||
* equals to one minutes. Autoscaler request LFC statistic with intervals 1,2,...,60 minutes
|
||||
* so 2^8=64 seems to be enough for our needs.
|
||||
*/
|
||||
#define HIST_SIZE 8
|
||||
#define HIST_MIN_INTERVAL 60 /* seconds */
|
||||
|
||||
/*
|
||||
* HyperLogLog is an approximate technique for computing the number of distinct
|
||||
* entries in a set. Importantly, it does this by using a fixed amount of
|
||||
@@ -69,18 +77,21 @@
|
||||
* modified timestamp >= the query timestamp. This value is the number of bits
|
||||
* for this register in the normal HLL calculation.
|
||||
*
|
||||
* The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB.
|
||||
* Usage could be halved if we decide to reduce the required time dimension
|
||||
* precision; as 32 bits in second precision should be enough for statistics.
|
||||
* However, that is not yet implemented.
|
||||
* The memory usage is 2^B * (C + 1) * sizeof(HyperLogLogRegister), or 920kiB.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
TimestampTz ts; /* last access timestamp */
|
||||
uint32_t histogram[HIST_SIZE]; /* access counter exponential histogram */
|
||||
} HyperLogLogRegister;
|
||||
|
||||
typedef struct HyperLogLogState
|
||||
{
|
||||
TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
|
||||
HyperLogLogRegister regs[HLL_N_REGISTERS][HLL_C_BITS + 1];
|
||||
} HyperLogLogState;
|
||||
|
||||
extern void initSHLL(HyperLogLogState *cState);
|
||||
extern void addSHLL(HyperLogLogState *cState, uint32 hash);
|
||||
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration);
|
||||
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration, double min_hit_ratio);
|
||||
|
||||
#endif
|
||||
|
||||
10
pgxn/neon/neon--1.4--1.5.sql
Normal file
10
pgxn/neon/neon--1.4--1.5.sql
Normal file
@@ -0,0 +1,10 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.5'" to load this file. \quit
|
||||
|
||||
-- returns minimal LFC cache size (in 8kb pages) provided specified hit rate
|
||||
CREATE FUNCTION approximate_optimal_cache_size(duration_sec integer default null, min_hit_ration float8 default null)
|
||||
RETURNS integer
|
||||
AS 'MODULE_PATHNAME', 'approximate_optimal_cache_size'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
GRANT EXECUTE ON FUNCTION approximate_optimal_cache_size(integer,float8) TO pg_monitor;
|
||||
|
||||
1
pgxn/neon/neon--1.5--1.4.sql
Normal file
1
pgxn/neon/neon--1.5--1.4.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP FUNCTION IF EXISTS approximate_optimal_cache_size(integer,float8) CASCADE;
|
||||
@@ -24,7 +24,6 @@
|
||||
* PushPage ('P'): Copy a page image (in the payload) to buffer cache
|
||||
* ApplyRecord ('A'): Apply a WAL record (in the payload)
|
||||
* GetPage ('G'): Return a page image from buffer cache.
|
||||
* Ping ('H'): Return the input message.
|
||||
*
|
||||
* Currently, you only get a response to GetPage requests; the response is
|
||||
* simply a 8k page, without any headers. Errors are logged to stderr.
|
||||
@@ -134,7 +133,6 @@ static void ApplyRecord(StringInfo input_message);
|
||||
static void apply_error_callback(void *arg);
|
||||
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
|
||||
static void GetPage(StringInfo input_message);
|
||||
static void Ping(StringInfo input_message);
|
||||
static ssize_t buffered_read(void *buf, size_t count);
|
||||
static void CreateFakeSharedMemoryAndSemaphores();
|
||||
|
||||
@@ -396,10 +394,6 @@ WalRedoMain(int argc, char *argv[])
|
||||
GetPage(&input_message);
|
||||
break;
|
||||
|
||||
case 'H': /* Ping */
|
||||
Ping(&input_message);
|
||||
break;
|
||||
|
||||
/*
|
||||
* EOF means we're done. Perform normal shutdown.
|
||||
*/
|
||||
@@ -1063,36 +1057,6 @@ GetPage(StringInfo input_message)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
Ping(StringInfo input_message)
|
||||
{
|
||||
int tot_written;
|
||||
/* Response: the input message */
|
||||
tot_written = 0;
|
||||
do {
|
||||
ssize_t rc;
|
||||
/* We don't need alignment, but it's bad practice to use char[BLCKSZ] */
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
static const PGIOAlignedBlock response;
|
||||
#else
|
||||
static const PGAlignedBlock response;
|
||||
#endif
|
||||
rc = write(STDOUT_FILENO, &response.data[tot_written], BLCKSZ - tot_written);
|
||||
if (rc < 0) {
|
||||
/* If interrupted by signal, just retry */
|
||||
if (errno == EINTR)
|
||||
continue;
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to stdout: %m")));
|
||||
}
|
||||
tot_written += rc;
|
||||
} while (tot_written < BLCKSZ);
|
||||
|
||||
elog(TRACE, "Page sent back for ping");
|
||||
}
|
||||
|
||||
|
||||
/* Buffer used by buffered_read() */
|
||||
static char stdin_buf[16 * 1024];
|
||||
static size_t stdin_len = 0; /* # of bytes in buffer */
|
||||
|
||||
@@ -71,37 +71,6 @@ impl ComputeHookTenant {
|
||||
}
|
||||
}
|
||||
|
||||
fn is_sharded(&self) -> bool {
|
||||
matches!(self, ComputeHookTenant::Sharded(_))
|
||||
}
|
||||
|
||||
/// Clear compute hook state for the specified shard.
|
||||
/// Only valid for [`ComputeHookTenant::Sharded`] instances.
|
||||
fn remove_shard(&mut self, tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize) {
|
||||
match self {
|
||||
ComputeHookTenant::Sharded(sharded) => {
|
||||
if sharded.stripe_size != stripe_size
|
||||
|| sharded.shard_count != tenant_shard_id.shard_count
|
||||
{
|
||||
tracing::warn!("Shard split detected while handling detach")
|
||||
}
|
||||
|
||||
let shard_idx = sharded.shards.iter().position(|(shard_number, _node_id)| {
|
||||
*shard_number == tenant_shard_id.shard_number
|
||||
});
|
||||
|
||||
if let Some(shard_idx) = shard_idx {
|
||||
sharded.shards.remove(shard_idx);
|
||||
} else {
|
||||
tracing::warn!("Shard not found while handling detach")
|
||||
}
|
||||
}
|
||||
ComputeHookTenant::Unsharded(_) => {
|
||||
unreachable!("Detach of unsharded tenants is handled externally");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set one shard's location. If stripe size or shard count have changed, Self is reset
|
||||
/// and drops existing content.
|
||||
fn update(
|
||||
@@ -645,36 +614,6 @@ impl ComputeHook {
|
||||
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reflect a detach for a particular shard in the compute hook state.
|
||||
///
|
||||
/// The goal is to avoid sending compute notifications with stale information (i.e.
|
||||
/// including detach pageservers).
|
||||
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(super) fn handle_detach(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
stripe_size: ShardStripeSize,
|
||||
) {
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
let mut state_locked = self.state.lock().unwrap();
|
||||
match state_locked.entry(tenant_shard_id.tenant_id) {
|
||||
Entry::Vacant(_) => {
|
||||
tracing::warn!("Compute hook tenant not found for detach");
|
||||
}
|
||||
Entry::Occupied(mut e) => {
|
||||
let sharded = e.get().is_sharded();
|
||||
if !sharded {
|
||||
e.remove();
|
||||
} else {
|
||||
e.get_mut().remove_shard(tenant_shard_id, stripe_size);
|
||||
}
|
||||
|
||||
tracing::debug!("Compute hook handled shard detach");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1849,7 +1849,7 @@ pub fn make_router(
|
||||
RequestName("v1_tenant_timeline"),
|
||||
)
|
||||
})
|
||||
.put(
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
|
||||
@@ -238,7 +238,7 @@ impl PageserverClient {
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"timeline_archival_config",
|
||||
crate::metrics::Method::Put,
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.timeline_archival_config(tenant_shard_id, timeline_id, req)
|
||||
|
||||
@@ -820,16 +820,6 @@ impl Reconciler {
|
||||
self.location_config(&node, conf, None, false).await?;
|
||||
}
|
||||
|
||||
// The condition below identifies a detach. We must have no attached intent and
|
||||
// must have been attached to something previously. Pass this information to
|
||||
// the [`ComputeHook`] such that it can update its tenant-wide state.
|
||||
if self.intent.attached.is_none() && !self.detach.is_empty() {
|
||||
// TODO: Consider notifying control plane about detaches. This would avoid situations
|
||||
// where the compute tries to start-up with a stale set of pageservers.
|
||||
self.compute_hook
|
||||
.handle_detach(self.tenant_shard_id, self.shard.stripe_size);
|
||||
}
|
||||
|
||||
failpoint_support::sleep_millis_async!("sleep-on-reconcile-epilogue");
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -5272,7 +5272,7 @@ impl Service {
|
||||
}
|
||||
AvailabilityTransition::ToActive => {
|
||||
tracing::info!("Node {} transition to active", node_id);
|
||||
// When a node comes back online, we must reconcile any non-detached tenant that has a None observed
|
||||
// When a node comes back online, we must reconcile any tenant that has a None observed
|
||||
// location on the node.
|
||||
for tenant_shard in locked.tenants.values_mut() {
|
||||
// If a reconciliation is already in progress, rely on the previous scheduling
|
||||
@@ -5282,9 +5282,7 @@ impl Service {
|
||||
}
|
||||
|
||||
if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
|
||||
if observed_loc.conf.is_none()
|
||||
&& !matches!(tenant_shard.policy, PlacementPolicy::Detached)
|
||||
{
|
||||
if observed_loc.conf.is_none() {
|
||||
self.maybe_reconcile_shard(tenant_shard, &new_nodes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -631,7 +631,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
log.info(
|
||||
f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}"
|
||||
)
|
||||
res = self.put(
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config",
|
||||
json=config,
|
||||
)
|
||||
|
||||
@@ -114,3 +114,46 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
|
||||
|
||||
assert estimation_1k >= 20 and estimation_1k <= 40
|
||||
assert estimation_10k >= 200 and estimation_10k <= 400
|
||||
|
||||
|
||||
def test_optimal_cache_size_approximation(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
config_lines=[
|
||||
"autovacuum = off",
|
||||
"shared_buffers=1MB",
|
||||
"neon.max_file_cache_size=256MB",
|
||||
"neon.file_cache_size_limit=245MB",
|
||||
],
|
||||
)
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("create extension neon version '1.5'")
|
||||
cur.execute(
|
||||
"create table t_huge(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
|
||||
)
|
||||
cur.execute(
|
||||
"create table t_small(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
|
||||
)
|
||||
cur.execute(
|
||||
"insert into t_huge(pk) values (generate_series(1,1000000))"
|
||||
) # table size is 21277 pages
|
||||
cur.execute(
|
||||
"insert into t_small(pk) values (generate_series(1,100000))"
|
||||
) # table size is 2128 pages
|
||||
time.sleep(2)
|
||||
before = time.monotonic()
|
||||
for _ in range(100):
|
||||
cur.execute("select sum(count) from t_small")
|
||||
cur.execute("select sum(count) from t_huge")
|
||||
after = time.monotonic()
|
||||
cur.execute(f"select approximate_working_set_size_seconds({int(after - before + 1)})")
|
||||
ws_estimation = cur.fetchall()[0][0]
|
||||
log.info(f"Working set size estimaton {ws_estimation}")
|
||||
cur.execute(f"select approximate_optimal_cache_size({int(after - before + 1)}, 0.99)")
|
||||
optimal_cache_size = cur.fetchall()[0][0]
|
||||
log.info(f"Optimal cache size for 99% hit rate {optimal_cache_size}")
|
||||
assert ws_estimation >= 20000 and ws_estimation <= 30000
|
||||
assert optimal_cache_size >= 2000 and optimal_cache_size <= 3000
|
||||
|
||||
Reference in New Issue
Block a user