From 48b22bd057070f10fda9e4c8a7f5326441617b22 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 20 Mar 2024 15:01:14 +0000
Subject: [PATCH 1/4] walredo: better benchmark

---
 pageserver/benches/bench_walredo.rs | 317 +++++++++++++---------------
 1 file changed, 144 insertions(+), 173 deletions(-)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 47c8bd75c6..72c0f2c6c0 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -1,160 +1,128 @@
-//! Simple benchmarking around walredo.
+//! Quantify a single walredo manager's throughput under N concurrent callers.
 //!
-//! Right now they hope to just set a baseline. Later we can try to expand into latency and
-//! throughput after figuring out the coordinated omission problems below.
+//! The benchmark implementation ([`bench_impl`]) is parametrized by
+//! - `redo_work` => [`Request::short_input`] or [`Request::medium_input`]
+//! - `n_redos` => number of times the benchmark shall execute the `redo_work`
+//! - `nclients` => number of clients (more on this shortly).
 //!
-//! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
-//! logging what happens when a sequential scan is requested on a small table, then picking out two
-//! suitable from logs.
+//! The benchmark impl sets up a multi-threaded tokio runtime with default parameters.
+//! It spawns `nclients` [`client`] tokio tasks.
+//! Each task executes the `redo_work` `n_redos/nclients` times.
 //!
+//! We exercise the following combinations:
+//! - `redo_work = short / medium`
+//! - `nclients = [1, 2, 4, 8, 16, 32, 64, 128]`
 //!
-//! Reference data (git blame to see commit) on an i3en.3xlarge
-// ```text
-//! short/short/1 time: [39.175 µs 39.348 µs 39.536 µs]
-//! short/short/2 time: [51.227 µs 51.487 µs 51.755 µs]
-//! short/short/4 time: [76.048 µs 76.362 µs 76.674 µs]
-//! short/short/8 time: [128.94 µs 129.82 µs 130.74 µs]
-//! short/short/16 time: [227.84 µs 229.00 µs 230.28 µs]
-//! short/short/32 time: [455.97 µs 457.81 µs 459.90 µs]
-//! short/short/64 time: [902.46 µs 904.84 µs 907.32 µs]
-//! short/short/128 time: [1.7416 ms 1.7487 ms 1.7561 ms]
-//! ``
-
-use std::sync::Arc;
+//! We let `criterion` determine the `n_redos` using `iter_custom`.
+//! The idea is that for each `(redo_work, nclients)` combination,
+//! criterion will run the `bench_impl` multiple times with different `n_redos`.
+//! The `bench_impl` reports the aggregate wall clock time from the clients' perspective.
+//! Criterion will divide that by `n_redos` to compute the "time per iteration".
+//! In our case, "time per iteration" means "time per redo_work execution".
+//!
+//! NB: the way by which `iter_custom` determines the "number of iterations"
+//! is called sampling. Apparently the idea here is to detect outliers.
+//! We're not sure whether the current choice of sampling method makes sense.
+//! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
 
 use bytes::{Buf, Bytes};
-use pageserver::{
-    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
+use criterion::{BenchmarkId, Criterion};
+use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver_api::{key::Key, shard::TenantShardId};
+use std::{
+    sync::Arc,
+    time::{Duration, Instant},
 };
-use pageserver_api::shard::TenantShardId;
-use tokio::task::JoinSet;
+use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};
 
-use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+fn bench(c: &mut Criterion) {
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("short");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::short_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }
 
-fn redo_scenarios(c: &mut Criterion) {
-    // logging should be enabled when adding more inputs, since walredo will only report malformed
-    // input to the stderr.
-    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();
+    {
+        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+        for nclients in nclients {
+            let mut group = c.benchmark_group("medium");
+            group.bench_with_input(
+                BenchmarkId::from_parameter(nclients),
+                &nclients,
+                |b, nclients| {
+                    let redo_work = Arc::new(Request::medium_input());
+                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
+                },
+            );
+        }
+    }
+}
+
+criterion::criterion_group!(benches, bench);
+criterion::criterion_main!(benches);
 
+// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
+fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
     let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
     let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-
-    let manager = Arc::new(manager);
-
-    {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        tracing::info!("executing first");
-        rt.block_on(short().execute(&manager)).unwrap();
-        tracing::info!("first executed");
-    }
-
-    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];
-
-    let mut group = c.benchmark_group("short");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("short", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
-            },
-        );
-    }
-    drop(group);
-
-    let mut group = c.benchmark_group("medium");
-    group.sampling_mode(criterion::SamplingMode::Flat);
-
-    for thread_count in thread_counts {
-        group.bench_with_input(
-            BenchmarkId::new("medium", thread_count),
-            &thread_count,
-            |b, thread_count| {
-                add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
-            },
-        );
-    }
-    drop(group);
-}
-
-/// Sets up a multi-threaded tokio runtime with default worker thread count,
-/// then, spawn `requesters` tasks that repeatedly:
-/// - get input from `input_factor()`
-/// - call `manager.request_redo()` with their input
-///
-/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
-///
-/// Using tokio's default worker thread count means the results will differ on machines
-/// with different core countrs. We don't care about that, the performance will always
-/// be different on different hardware. To compare performance of different software versions,
-/// use the same hardware.
-fn add_multithreaded_walredo_requesters(
-    b: &mut criterion::Bencher,
-    nrequesters: usize,
-    manager: &Arc<PostgresRedoManager>,
-    input_factory: fn() -> Request,
-) {
-    assert_ne!(nrequesters, 0);
-
     let rt = tokio::runtime::Builder::new_multi_thread()
         .enable_all()
         .build()
         .unwrap();
 
-    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));
+    let start = Arc::new(Barrier::new(nclients as usize));
 
-    let mut requesters = JoinSet::new();
-    for _ in 0..nrequesters {
-        let _entered = rt.enter();
-        let manager = manager.clone();
-        let barrier = barrier.clone();
-        requesters.spawn(async move {
-            loop {
-                let input = input_factory();
-                barrier.wait().await;
-                let page = input.execute(&manager).await.unwrap();
-                assert_eq!(page.remaining(), 8192);
-                barrier.wait().await;
-            }
+    let mut tasks = JoinSet::new();
+
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+    let manager = Arc::new(manager);
+
+    for _ in 0..nclients {
+        rt.block_on(async {
+            tasks.spawn(client(
+                Arc::clone(&manager),
+                Arc::clone(&start),
+                Arc::clone(&redo_work),
+                // divide the amount of work equally among the clients
+                n_redos / nclients,
+            ))
         });
     }
 
-    let do_one_iteration = || {
-        rt.block_on(async {
-            barrier.wait().await;
-            // wait for work to complete
-            barrier.wait().await;
-        })
-    };
-
-    b.iter_batched(
-        || {
-            // warmup
-            do_one_iteration();
-        },
-        |()| {
-            // work loop
-            do_one_iteration();
-        },
-        criterion::BatchSize::PerIteration,
-    );
-
-    rt.block_on(requesters.shutdown());
+    let mut total_wallclock_time = std::time::Duration::from_millis(0);
+    while let Some(res) = rt.block_on(tasks.join_next()) {
+        total_wallclock_time += res.unwrap();
+    }
+    total_wallclock_time
 }
 
-criterion_group!(benches, redo_scenarios);
-criterion_main!(benches);
+async fn client(
+    mgr: Arc<PostgresRedoManager>,
+    start: Arc<Barrier>,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+) -> Duration {
+    start.wait().await;
+    let start = Instant::now();
+    for _ in 0..n_redos {
+        let page = redo_work.execute(&mgr).await.unwrap();
+        assert_eq!(page.remaining(), 8192);
+    }
+    start.elapsed()
+}
 
 macro_rules! lsn {
     ($input:expr) => {{
@@ -166,12 +134,46 @@ macro_rules! lsn {
     }};
 }
 
-/// Short payload, 1132 bytes.
-// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
-// for null bytes.
-#[allow(clippy::octal_escapes)]
-fn short() -> Request {
-    Request {
+/// Simple wrapper around `WalRedoManager::request_redo`.
+///
+/// In benchmarks this is cloned around.
+#[derive(Clone)]
+struct Request {
+    key: Key,
+    lsn: Lsn,
+    base_img: Option<(Lsn, Bytes)>,
+    records: Vec<(Lsn, NeonWalRecord)>,
+    pg_version: u32,
+}
+
+impl Request {
+    async fn execute(&self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
+        let Request {
+            key,
+            lsn,
+            base_img,
+            records,
+            pg_version,
+        } = self;
+
+        // TODO: avoid these clones
+        manager
+            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .await
+    }
+
+    fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
+        let rec = Bytes::from_static(bytes);
+        NeonWalRecord::Postgres { will_init, rec }
+    }
+
+    /// Short payload, 1132 bytes.
+    // pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
+    // for null bytes.
+    #[allow(clippy::octal_escapes)]
+    pub fn short_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
         key: Key {
             field1: 0,
             field2: 1663,
@@ -194,13 +196,14 @@
         ],
         pg_version: 14,
     }
-}
+    }
 
-/// Medium sized payload, serializes as 26393 bytes.
-// see [`short`]
-#[allow(clippy::octal_escapes)]
-fn medium() -> Request {
-    Request {
+    /// Medium sized payload, serializes as 26393 bytes.
+    // see [`short_input`]
+    #[allow(clippy::octal_escapes)]
+    pub fn medium_input() -> Request {
+        let pg_record = Self::pg_record;
+        Request {
         key: Key {
             field1: 0,
             field2: 1663,
@@ -442,37 +445,5 @@
         ],
         pg_version: 14,
     }
-}
-
-fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
-    let rec = Bytes::from_static(bytes);
-    NeonWalRecord::Postgres { will_init, rec }
-}
-
-/// Simple wrapper around `WalRedoManager::request_redo`.
-///
-/// In benchmarks this is cloned around.
-#[derive(Clone)]
-struct Request {
-    key: Key,
-    lsn: Lsn,
-    base_img: Option<(Lsn, Bytes)>,
-    records: Vec<(Lsn, NeonWalRecord)>,
-    pg_version: u32,
-}
-
-impl Request {
-    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
-        let Request {
-            key,
-            lsn,
-            base_img,
-            records,
-            pg_version,
-        } = self;
-
-        manager
-            .request_redo(key, lsn, base_img, records, pg_version)
-            .await
+    }
 }

From 3d16cda846f4e7b8e929c61db13093586dca93d1 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Wed, 20 Mar 2024 18:03:09 +0200
Subject: [PATCH 2/4] refactor(layer): use detached init (#7152)

The second part of work towards fixing `Layer::keep_resident` so that
it does not need to repair the internal state. #7135 added a nicer API
for initialization. This PR uses it to remove a few indentation levels
and the loop construction. The next PR #7175 will use the refactorings
done in this PR, and always initialize the internal state after a
download.

Cc: #5331
---
 pageserver/src/tenant/storage_layer/layer.rs  | 329 +++++++++---------
 .../src/tenant/storage_layer/layer/tests.rs   |   3 +
 2 files changed, 166 insertions(+), 166 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index f37d7e6449..eed423c3e6 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -702,181 +702,132 @@ impl LayerInner {
         allow_download: bool,
         ctx: Option<&RequestContext>,
     ) -> Result<Arc<DownloadedLayer>, DownloadError> {
-        let mut init_permit = None;
+        let (weak, permit) = {
+            let locked = self
+                .inner
+                .get_or_init_detached()
+                .await
+                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));
 
-        loop {
-            let download = move |permit| {
-                async move {
-                    // disable any scheduled but not yet running eviction deletions for this
-                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+            match locked {
+                // this path could have been a RwLock::read
+                Ok(Ok((strong, upgraded))) if !upgraded => return Ok(strong),
+                Ok(Ok((strong, _))) => {
+                    // when upgraded back, the Arc is still available, but
+                    // previously an `evict_and_wait` was received.
+                    self.wanted_evicted.store(false, Ordering::Relaxed);
 
-                    // no need to make the evict_and_wait wait for the actual download to complete
+                    // error out any `evict_and_wait`
                     drop(self.status.send(Status::Downloaded));
-
-                    let timeline = self
-                        .timeline
-                        .upgrade()
-                        .ok_or_else(|| DownloadError::TimelineShutdown)?;
-
-                    // count cancellations, which currently remain largely unexpected
-                    let init_cancelled =
-                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
-                    let can_ever_evict = timeline.remote_client.as_ref().is_some();
-
-                    // check if we really need to be downloaded; could have been already downloaded by a
-                    // cancelled previous attempt.
-                    let needs_download = self
-                        .needs_download()
-                        .await
-                        .map_err(DownloadError::PreStatFailed);
-
-                    let needs_download = match needs_download {
-                        Ok(reason) => reason,
-                        Err(e) => {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(e);
-                        }
-                    };
-
-                    let (permit, downloaded) = if let Some(reason) = needs_download {
-                        if let NeedsDownload::NotFile(ft) = reason {
-                            return Err(DownloadError::NotFile(ft));
-                        }
-
-                        // only reset this after we've decided we really need to download. otherwise it'd
-                        // be impossible to mark cancelled downloads for eviction, like one could imagine
-                        // we would like to do for prefetching which was not needed.
-                        self.wanted_evicted.store(false, Ordering::Release);
-
-                        if !can_ever_evict {
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::NoRemoteStorage);
-                        }
-
-                        if let Some(ctx) = ctx {
-                            let res = self.check_expected_download(ctx);
-                            if let Err(e) = res {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        }
-
-                        if !allow_download {
-                            // this does look weird, but for LayerInner the "downloading" means also changing
-                            // internal once related state ...
-                            scopeguard::ScopeGuard::into_inner(init_cancelled);
-                            return Err(DownloadError::DownloadRequired);
-                        }
-
-                        tracing::info!(%reason, "downloading on-demand");
-
-                        let permit = self.spawn_download_and_wait(timeline, permit).await;
-
-                        let permit = match permit {
-                            Ok(permit) => permit,
-                            Err(e) => {
-                                scopeguard::ScopeGuard::into_inner(init_cancelled);
-                                return Err(e);
-                            }
-                        };
-
-                        (permit, true)
-                    } else {
-                        // the file is present locally, probably by a previous but cancelled call to
-                        // get_or_maybe_download. alternatively we might be running without remote storage.
-                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                        (permit, false)
-                    };
-
-                    scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                    if downloaded {
-                        let since_last_eviction = self
-                            .last_evicted_at
-                            .lock()
-                            .unwrap()
-                            .take()
-                            .map(|ts| ts.elapsed());
-
-                        if let Some(since_last_eviction) = since_last_eviction {
-                            LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                        }
-                    }
-
-                    let res = Arc::new(DownloadedLayer {
-                        owner: Arc::downgrade(self),
-                        kind: tokio::sync::OnceCell::default(),
-                        version: next_version,
-                    });
-
-                    self.access_stats.record_residence_event(
-                        LayerResidenceStatus::Resident,
-                        LayerResidenceEventReason::ResidenceChange,
-                    );
-
-                    let waiters = self.inner.initializer_count();
-                    if waiters > 0 {
-                        tracing::info!(
-                            waiters,
-                            "completing the on-demand download for other tasks"
-                        );
-                    }
-
-                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
-                }
-                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
-            };
-
-            if let Some(init_permit) = init_permit.take() {
-                // use the already held initialization permit because it is impossible to hit the
-                // below paths anymore essentially limiting the max loop iterations to 2.
-                let (value, init_permit) = download(init_permit).await?;
-                let mut guard = self.inner.set(value, init_permit);
-                let (strong, _upgraded) = guard
-                    .get_and_upgrade()
-                    .expect("init creates strong reference, we held the init permit");
-                return Ok(strong);
-            }
-
-            let (weak, permit) = {
-                let mut locked = self.inner.get_or_init(download).await?;
-
-                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
-                    if upgraded {
-                        // when upgraded back, the Arc is still available, but
-                        // previously an `evict_and_wait` was received.
-                        self.wanted_evicted.store(false, Ordering::Relaxed);
-
-                        // error out any `evict_and_wait`
-                        drop(self.status.send(Status::Downloaded));
-                        LAYER_IMPL_METRICS
-                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
-                    }
+                    LAYER_IMPL_METRICS
+                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
 
                     return Ok(strong);
-                } else {
+                }
+                Ok(Err(mut guard)) => {
                     // path to here: the evict_blocking is stuck on spawn_blocking queue.
                     //
                     // reset the contents, deactivating the eviction and causing a
                     // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
-                    locked.take_and_deinit()
+                    let (weak, permit) = guard.take_and_deinit();
+                    (Some(weak), permit)
                 }
-            };
-
-            // unlock first, then drop the weak, but because upgrade failed, we
-            // know it cannot be a problem.
+                Err(permit) => (None, permit),
+            }
+        };
+        if let Some(weak) = weak {
+            // only drop the weak after dropping the heavier_once_cell guard
             assert!(
                 matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
                 "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
            );
-
-            init_permit = Some(permit);
-
-            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
         }
+
+        async move {
+            // disable any scheduled but not yet running eviction deletions for this
+            let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+
+            // no need to make the evict_and_wait wait for the actual download to complete
+            drop(self.status.send(Status::Downloaded));
+
+            let timeline = self
+                .timeline
+                .upgrade()
+                .ok_or_else(|| DownloadError::TimelineShutdown)?;
+
+            // count cancellations, which currently remain largely unexpected
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
+            // check if we really need to be downloaded; could have been already downloaded by a
+            // cancelled previous attempt.
+            let needs_download = self
+                .needs_download()
+                .await
+                .map_err(DownloadError::PreStatFailed);
+
+            let needs_download = match needs_download {
+                Ok(reason) => reason,
+                Err(e) => {
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+                    return Err(e);
+                }
+            };
+
+            let Some(reason) = needs_download else {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+                // the file is present locally, probably by a previous but cancelled call to
+                // get_or_maybe_download. alternatively we might be running without remote storage.
+                LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+                let res = self.initialize_after_layer_is_on_disk(next_version, permit, false);
+                return Ok(res);
+            };
+
+            if let NeedsDownload::NotFile(ft) = reason {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::NotFile(ft));
+            }
+
+            // only reset this after we've decided we really need to download. otherwise it'd
+            // be impossible to mark cancelled downloads for eviction, like one could imagine
+            // we would like to do for prefetching which was not needed.
+            self.wanted_evicted.store(false, Ordering::Release);
+
+            if timeline.remote_client.as_ref().is_none() {
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::NoRemoteStorage);
+            }
+
+            if let Some(ctx) = ctx {
+                let res = self.check_expected_download(ctx);
+                if let Err(e) = res {
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+                    return Err(e);
+                }
+            }
+
+            if !allow_download {
+                // this does look weird, but for LayerInner the "downloading" means also changing
+                // internal once related state ...
+                scopeguard::ScopeGuard::into_inner(init_cancelled);
+                return Err(DownloadError::DownloadRequired);
+            }
+
+            tracing::info!(%reason, "downloading on-demand");
+
+            let permit = self.spawn_download_and_wait(timeline, permit).await;
+
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+            let permit = permit?;
+
+            let res = self.initialize_after_layer_is_on_disk(next_version, permit, true);
+            Ok(res)
+        }
+        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
+        .await
     }
 
     /// Nag or fail per RequestContext policy
@@ -1026,6 +977,59 @@ impl LayerInner {
         }
     }
 
+    /// Initializes the `Self::inner` to a "resident" state.
+    ///
+    /// Callers are assumed to ensure that the file is actually on disk with `Self::needs_download`
+    /// before calling this method.
+    ///
+    /// If this method is ever made async, it needs to be cancellation safe so that no state
+    /// changes are made before we can write to the OnceCell in non-cancellable fashion.
+    fn initialize_after_layer_is_on_disk(
+        self: &Arc<Self>,
+        next_version: usize,
+        permit: heavier_once_cell::InitPermit,
+        downloaded: bool,
+    ) -> Arc<DownloadedLayer> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        if downloaded {
+            let since_last_eviction = self
+                .last_evicted_at
+                .lock()
+                .unwrap()
+                .take()
+                .map(|ts| ts.elapsed());
+            if let Some(since_last_eviction) = since_last_eviction {
+                // FIXME: this will not always be recorded correctly until #6028 (the no
+                // download needed branch above)
+                LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+            }
+        }
+
+        let res = Arc::new(DownloadedLayer {
+            owner: Arc::downgrade(self),
+            kind: tokio::sync::OnceCell::default(),
+            version: next_version,
+        });
+
+        // FIXME: this might now be double-accounted for !downloaded
+        self.access_stats.record_residence_event(
+            LayerResidenceStatus::Resident,
+            LayerResidenceEventReason::ResidenceChange,
+        );
+
+        let waiters = self.inner.initializer_count();
+        if waiters > 0 {
+            tracing::info!(waiters, "completing the on-demand download for other tasks");
+        }
+
+        let value = ResidentOrWantedEvicted::Resident(res.clone());
+
+        self.inner.set(value, permit);
+
+        res
+    }
+
     async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
         match tokio::fs::metadata(&self.path).await {
             Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -1690,11 +1694,6 @@ impl LayerImplMetrics {
         self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }
 
-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
-    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
-    }
-
     /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
         self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
@@ -1779,7 +1778,6 @@ impl DeleteFailed {
 #[derive(enum_map::Enum)]
 enum RareEvent {
     RemoveOnDropFailed,
-    RetriedGetOrMaybeDownload,
     DownloadFailedWithoutRequester,
     UpgradedWantedEvicted,
     InitWithoutDownload,
@@ -1793,7 +1791,6 @@ impl RareEvent {
 
         match self {
             RemoveOnDropFailed => "remove_on_drop_failed",
-            RetriedGetOrMaybeDownload => "retried_gomd",
             DownloadFailedWithoutRequester => "download_failed_without",
             UpgradedWantedEvicted => "raced_wanted_evicted",
             InitWithoutDownload => "init_needed_no_download",
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index b43534efd4..e7b2eb025a 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -254,6 +254,8 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     let h = TenantHarness::create("residency_check_while_evict_and_wait_on_clogged_spawn_blocking")
         .unwrap();
     let (tenant, ctx) = h.load().await;
+    let span = h.span();
+    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
 
     let timeline = tenant
         .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
@@ -292,6 +294,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
     // because no actual eviction happened, we get to just reinitialize the DownloadedLayer
     layer
         .keep_resident()
+        .instrument(download_span)
         .await
         .expect("keep_resident should have reinitialized without downloading")
         .expect("ResidentLayer");

From 929423cf68b7cbef1edd27c24523460d5e3ac9a8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Wed, 20 Mar 2024 18:04:14 +0000
Subject: [PATCH 3/4] add i3en.3xlarge reference numbers

---
 pageserver/benches/bench_walredo.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 72c0f2c6c0..0d0e940bd5 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -24,6 +24,29 @@
 //! is called sampling. Apparently the idea here is to detect outliers.
 //! We're not sure whether the current choice of sampling method makes sense.
 //! See https://bheisler.github.io/criterion.rs/book/user_guide/command_line_output.html#collecting-samples
+//!
+//! # Reference Numbers
+//!
+//! 2024-03-20 on i3en.3xlarge
+//!
+//! ```text
+//! short/1     time:   [21.612 µs 21.707 µs 21.817 µs]
+//! short/2     time:   [27.216 µs 27.372 µs 27.557 µs]
+//! short/4     time:   [44.398 µs 45.858 µs 47.178 µs]
+//! short/8     time:   [81.236 µs 83.332 µs 85.419 µs]
+//! short/16    time:   [138.29 µs 139.76 µs 141.24 µs]
+//! short/32    time:   [149.67 µs 150.44 µs 151.30 µs]
+//! short/64    time:   [155.31 µs 155.90 µs 156.59 µs]
+//! short/128   time:   [156.33 µs 156.85 µs 157.44 µs]
+//! medium/1    time:   [105.47 µs 105.87 µs 106.36 µs]
+//! medium/2    time:   [157.09 µs 157.68 µs 158.40 µs]
+//! medium/4    time:   [293.69 µs 306.80 µs 318.11 µs]
+//! medium/8    time:   [594.88 µs 614.05 µs 633.18 µs]
+//! medium/16   time:   [848.28 µs 853.06 µs 858.68 µs]
+//! medium/32   time:   [916.31 µs 920.97 µs 926.27 µs]
+//! medium/64   time:   [939.99 µs 945.49 µs 951.30 µs]
+//! medium/128  time:   [918.89 µs 928.75 µs 938.41 µs]
+//! ```
 
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};

From 2726b1934ebd9d12d976ce9e9a41783d9ab238a8 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Wed, 20 Mar 2024 18:07:45 +0000
Subject: [PATCH 4/4] pageserver: extra debug for test_secondary_downloads
 failures (#7183)

- Enable debug logs for this test
- Add some debug logging detail in downloader.rs
- Add an info-level message in scheduler.rs that makes it obvious if a
  command is waiting for an existing task rather than spawning a new
  one.
---
 pageserver/src/tenant/secondary/downloader.rs    | 13 ++++++++++++-
 pageserver/src/tenant/secondary/scheduler.rs     |  1 +
 test_runner/regress/test_pageserver_secondary.py |  4 ++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index a595096133..82af7ed83b 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -534,7 +534,11 @@ impl<'a> TenantDownloader<'a> {
             .await
             .maybe_fatal_err(&context_msg)?;
 
-        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+        tracing::debug!(
+            "Wrote local heatmap to {}, with {} timelines",
+            heatmap_path,
+            heatmap.timelines.len()
+        );
 
         // Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
         // principle that deletions should be done before writes wherever possible, and so that we can use this
@@ -547,6 +551,10 @@ impl<'a> TenantDownloader<'a> {
         // Download the layers in the heatmap
         for timeline in heatmap.timelines {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!(
+                    "Cancelled before downloading timeline {}",
+                    timeline.timeline_id
+                );
                 return Ok(());
             }
@@ -764,10 +772,13 @@ impl<'a> TenantDownloader<'a> {
             }
         };
 
+        tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+
         // Download heatmap layers that are not present on local disk, or update their
         // access time if they are already present.
         for layer in timeline.layers {
             if self.secondary_state.cancel.is_cancelled() {
+                tracing::debug!("Cancelled -- dropping out of layer loop");
                 return Ok(());
             }
diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs
index 58bdb54161..3bd7be782e 100644
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -300,6 +300,7 @@ where
             let tenant_shard_id = job.get_tenant_shard_id();
 
             let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+                tracing::info!("Command already running, waiting for it");
                 barrier
             } else {
                 let running = self.spawn_now(job);
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index e664547b69..2e57136607 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -432,6 +432,10 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
     - Eviction of layers on the attached location results in deletion
       on the secondary location as well.
     """
+
+    # For debug of https://github.com/neondatabase/neon/issues/6966
+    neon_env_builder.rust_log_override = "DEBUG"
+
     neon_env_builder.num_pageservers = 2
     neon_env_builder.enable_pageserver_remote_storage(
         remote_storage_kind=RemoteStorageKind.MOCK_S3,