From cad0dca4b843759dbd8d5396c702ba4ec47de02f Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Sat, 18 Nov 2023 12:43:54 +0100 Subject: [PATCH 1/4] compute_ctl: Remove deprecated flag `--file-cache-on-disk` (#5622) See neondatabase/cloud#7516 for more. --- compute_tools/src/bin/compute_ctl.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 81d4320b14..7f22bda13e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -479,13 +479,6 @@ fn cli() -> clap::Command { ) .value_name("FILECACHE_CONNSTR"), ) - .arg( - // DEPRECATED, NO LONGER DOES ANYTHING. - // See https://github.com/neondatabase/cloud/issues/7516 - Arg::new("file-cache-on-disk") - .long("file-cache-on-disk") - .action(clap::ArgAction::SetTrue), - ) } #[test] From 3b3f040be339efd89dd7057ebdded172f753d8ae Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 19 Nov 2023 15:16:31 +0100 Subject: [PATCH 2/4] fix(background_tasks): first backoff, compaction error stacktraces (#5881) First compaction/gc error backoff starts from 0 which is less than 2s what it was before #5672. This is now fixed to be the intended 2**n. Additionally noticed the `compaction_iteration` creating an `anyhow::Error` via `into()` always captures a stacktrace even if we had a stacktraceful anyhow error within the CompactionError because there is no stable api for querying that. --- pageserver/src/tenant.rs | 14 ++++---------- pageserver/src/tenant/tasks.rs | 4 ++-- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 758f8b15a1..4025e93f66 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1649,22 +1649,16 @@ impl Tenant { /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. - pub async fn compaction_iteration( + async fn compaction_iteration( &self, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result<()> { - // Don't start doing work during shutdown - if let TenantState::Stopping { .. } = self.current_state() { + ) -> anyhow::Result<(), timeline::CompactionError> { + // Don't start doing work during shutdown, or when broken, we do not need those in the logs + if !self.is_active() { return Ok(()); } - // We should only be called once the tenant has activated. - anyhow::ensure!( - self.is_active(), - "Cannot run compaction iteration on inactive tenant" - ); - { let conf = self.tenant_conf.read().unwrap(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index eb77f7c83a..381d731b79 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -180,7 +180,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { // Run compaction if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await { let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count, + error_run_count + 1, 1.0, MAX_BACKOFF_SECS, ); @@ -261,7 +261,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { .await; if let Err(e) = res { let wait_duration = backoff::exponential_backoff_duration_seconds( - error_run_count, + error_run_count + 1, 1.0, MAX_BACKOFF_SECS, ); From d22dce2e31093205d99cdc4aaff68499f8b6b749 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 19 Nov 2023 15:21:16 +0100 Subject: [PATCH 3/4] pageserver: shut down idle walredo processes (#5877) The longer a pageserver runs, the more walredo processes it accumulates from tenants that are touched intermittently (e.g. by availability checks). This can lead to getting OOM killed. Changes: - Add an Instant recording the last use of the walredo process for a tenant - After compaction iteration in the background task, check for idleness and stop the walredo process if idle for more than 10x compaction period. Cc: #3620 Co-authored-by: Joonas Koivunen Co-authored-by: Shany Pozin --- pageserver/src/tenant.rs | 10 ++++++++++ pageserver/src/tenant/tasks.rs | 4 ++++ pageserver/src/walredo.rs | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4025e93f66..04fe9db76a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -291,6 +291,16 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { + match self { + Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } + pub async fn request_redo( &self, key: crate::repository::Key, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 381d731b79..e59001297c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -198,6 +198,10 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); + // Perhaps we did no work and the walredo process has been idle for some time: + // give it a chance to shut down to avoid leaving walredo process running indefinitely. + tenant.walredo_mgr.maybe_quiesce(period * 10); + // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index ccdf621c30..9290940acf 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -91,6 +91,7 @@ struct ProcessOutput { pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, + last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, } @@ -187,10 +188,26 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, + last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), } } + /// This type doesn't have its own background task to check for idleness: we + /// rely on our owner calling this function periodically in its own housekeeping + /// loops. + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { + if let Ok(g) = self.last_redo_at.try_lock() { + if let Some(last_redo_at) = *g { + if last_redo_at.elapsed() >= idle_timeout { + drop(g); + let mut guard = self.redo_process.write().unwrap(); + *guard = None; + } + } + } + } + /// /// Process one request for WAL redo using wal-redo postgres /// @@ -205,6 +222,8 @@ impl PostgresRedoManager { wal_redo_timeout: Duration, pg_version: u32, ) -> anyhow::Result { + *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); + let (rel, blknum) = key_to_rel_block(key).context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; From ac08072d2e9f4091358f47ab86af869bd8ee9231 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Sun, 19 Nov 2023 15:57:39 +0100 Subject: [PATCH 4/4] fix(layer): VirtualFile opening and read errors can be caused by contention (#5880) A very low number of layer loads have been marked wrongly as permanent, as I did not remember that `VirtualFile::open` or reading could fail transiently for contention. Return separate errors for transient and persistent errors from `{Delta,Image}LayerInner::load`. Includes drive-by comment changes. The implementation looks quite ugly because having the same type be both the inner (operation error) and outer (critical error), but with the alternatives I tried I did not find a better way. --- pageserver/src/metrics.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 30 +++++++++---- .../src/tenant/storage_layer/image_layer.rs | 33 +++++++++++---- pageserver/src/tenant/storage_layer/layer.rs | 42 ++++++++++++------- pageserver/src/walredo.rs | 7 ++-- 5 files changed, 77 insertions(+), 37 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4b52f07326..d5915f4c98 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -638,7 +638,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ /// /// Operations: /// - open ([`std::fs::OpenOptions::open`]) -/// - close (dropping [`std::fs::File`]) +/// - close (dropping [`crate::virtual_file::VirtualFile`]) /// - close-by-replace (close by replacement algorithm) /// - read (`read_at`) /// - write (`write_at`) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 4ccce3d6bd..79f37dcb2d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -289,7 +289,9 @@ impl DeltaLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx).await?; + let loaded = DeltaLayerInner::load(&path, None, ctx) + .await + .and_then(|res| res)?; // not production code let actual_filename = path.file_name().unwrap().to_owned(); @@ -610,18 +612,28 @@ impl Drop for DeltaLayerWriter { } impl DeltaLayerInner { + /// Returns nested result following Result, Critical>: + /// - inner has the success or transient failure + /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, summary: Option, ctx: &RequestContext, - ) -> anyhow::Result { - let file = VirtualFile::open(path) - .await - .with_context(|| format!("Failed to open file '{path}'"))?; + ) -> Result, anyhow::Error> { + let file = match VirtualFile::open(path).await { + Ok(file) => file, + Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), + }; let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let summary_blk = match file.read_blk(0, ctx).await { + Ok(blk) => blk, + Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), + }; + + // TODO: this should be an assertion instead; see ImageLayerInner::load + let actual_summary = + Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?; if let Some(mut expected_summary) = summary { // production code path @@ -636,11 +648,11 @@ impl DeltaLayerInner { } } - Ok(DeltaLayerInner { + Ok(Ok(DeltaLayerInner { file, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, - }) + })) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b7b9ca69b0..c38a9f6883 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -249,7 +249,9 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?; + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) + .await + .and_then(|res| res)?; // not production code let actual_filename = path.file_name().unwrap().to_owned(); @@ -295,18 +297,31 @@ impl ImageLayer { } impl ImageLayerInner { + /// Returns nested result following Result, Critical>: + /// - inner has the success or transient failure + /// - outer has the permanent failure pub(super) async fn load( path: &Utf8Path, lsn: Lsn, summary: Option, ctx: &RequestContext, - ) -> anyhow::Result { - let file = VirtualFile::open(path) - .await - .with_context(|| format!("Failed to open file '{}'", path))?; + ) -> Result, anyhow::Error> { + let file = match VirtualFile::open(path).await { + Ok(file) => file, + Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), + }; let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let summary_blk = match file.read_blk(0, ctx).await { + Ok(blk) => blk, + Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), + }; + + // length is the only way how this could fail, so it's not actually likely at all unless + // read_blk returns wrong sized block. + // + // TODO: confirm and make this into assertion + let actual_summary = + Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?; if let Some(mut expected_summary) = summary { // production code path @@ -322,12 +337,12 @@ impl ImageLayerInner { } } - Ok(ImageLayerInner { + Ok(Ok(ImageLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, lsn, file, - }) + })) } pub(super) async fn get_value_reconstruct_data( diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 17df39733f..f28f1c9444 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -868,6 +868,9 @@ impl LayerInner { } Ok((Err(e), _permit)) => { // FIXME: this should be with the spawned task and be cancellation sensitive + // + // while we should not need this, this backoff has turned out to be useful with + // a bug of unexpectedly deleted remote layer file (#5787). let consecutive_failures = self.consecutive_failures.fetch_add(1, Ordering::Relaxed); tracing::error!(consecutive_failures, "layer file download failed: {e:#}"); @@ -1196,7 +1199,7 @@ impl DownloadedLayer { )); delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) .await - .map(LayerKind::Delta) + .map(|res| res.map(LayerKind::Delta)) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1207,23 +1210,32 @@ impl DownloadedLayer { )); image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) .await - .map(LayerKind::Image) - } - // this will be a permanent failure - .context("load layer"); + .map(|res| res.map(LayerKind::Image)) + }; - if let Err(e) = res.as_ref() { - LAYER_IMPL_METRICS.inc_permanent_loading_failures(); - // TODO(#5815): we are not logging all errors, so temporarily log them here as well - tracing::error!("layer loading failed permanently: {e:#}"); + match res { + Ok(Ok(layer)) => Ok(Ok(layer)), + Ok(Err(transient)) => Err(transient), + Err(permanent) => { + LAYER_IMPL_METRICS.inc_permanent_loading_failures(); + // TODO(#5815): we are not logging all errors, so temporarily log them **once** + // here as well + let permanent = permanent.context("load layer"); + tracing::error!("layer loading failed permanently: {permanent:#}"); + Ok(Err(permanent)) + } } - res }; - self.kind.get_or_init(init).await.as_ref().map_err(|e| { - // errors are not clonabled, cannot but stringify - // test_broken_timeline matches this string - anyhow::anyhow!("layer loading failed: {e:#}") - }) + self.kind + .get_or_try_init(init) + // return transient errors using `?` + .await? + .as_ref() + .map_err(|e| { + // errors are not clonabled, cannot but stringify + // test_broken_timeline matches this string + anyhow::anyhow!("layer loading failed: {e:#}") + }) } async fn get_value_reconstruct_data( diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 9290940acf..5d8cc0e181 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -367,12 +367,13 @@ impl PostgresRedoManager { self.apply_record_neon(key, &mut page, *record_lsn, record)?; } // Success! - let end_time = Instant::now(); - let duration = end_time.duration_since(start_time); + let duration = start_time.elapsed(); + // FIXME: using the same metric here creates a bimodal distribution by default, and because + // there could be multiple batch sizes this would be N+1 modal. WAL_REDO_TIME.observe(duration.as_secs_f64()); debug!( - "neon applied {} WAL records in {} ms to reconstruct page image at LSN {}", + "neon applied {} WAL records in {} us to reconstruct page image at LSN {}", records.len(), duration.as_micros(), lsn