diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 034e5f8c91..54588e788c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run += 1; let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); - log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled()); + log_compaction_error( + &err, + Some((error_run, backoff)), + cancel.is_cancelled(), + false, + ); continue; } } @@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error( err: &CompactionError, retry_info: Option<(u32, Duration)>, task_cancelled: bool, + degrade_to_warning: bool, ) { use CompactionError::*; @@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error( } } else { match level { + Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), Level::ERROR => error!("Compaction failed: {err:#}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6ca3704bc1..5174da0f43 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1940,7 +1940,7 @@ impl Timeline { ) .await; if let Err(err) = &res { - log_compaction_error(err, None, cancel.is_cancelled()); + log_compaction_error(err, None, cancel.is_cancelled(), false); } res } @@ -6353,10 +6353,33 @@ impl Timeline { /// Reconstruct a value, using the given base image and WAL records in 'data'. async fn reconstruct_value( + &self, + key: Key, + request_lsn: Lsn, + data: ValueReconstructState, + ) -> Result { + self.reconstruct_value_inner(key, request_lsn, data, false) + .await + } + + /// Reconstruct a value, using the given base image and WAL records in 'data'. 
It does not fire critical errors because + /// sometimes it is expected to fail due to unreplayable history described in [reference missing; TODO restore the original link]. + async fn reconstruct_value_wo_critical_error( + &self, + key: Key, + request_lsn: Lsn, + data: ValueReconstructState, + ) -> Result { + self.reconstruct_value_inner(key, request_lsn, data, true) + .await + } + + async fn reconstruct_value_inner( &self, key: Key, request_lsn: Lsn, mut data: ValueReconstructState, + no_critical_error: bool, ) -> Result { // Perform WAL redo if needed data.records.reverse(); @@ -6413,7 +6436,9 @@ impl Timeline { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { - critical!("walredo failure during page reconstruction: {err:?}"); + if !no_critical_error { + critical!("walredo failure during page reconstruction: {err:?}"); + } return Err(PageReconstructError::WalRedo( err.context("reconstruct a page image"), )); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 73f6691f14..8403c0a7d9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -448,7 +448,7 @@ impl GcCompactionQueue { ) -> Result { let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await; if let Err(err) = &res { - log_compaction_error(err, None, cancel.is_cancelled()); + log_compaction_error(err, None, cancel.is_cancelled(), true); } match res { Ok(res) => Ok(res), @@ -2410,7 +2410,9 @@ impl Timeline { } else { lsn_split_points[i] }; - let img = self.reconstruct_value(key, request_lsn, state).await?; + let img = self + .reconstruct_value_wo_critical_error(key, request_lsn, state) + .await?; Some((request_lsn, img)) } else { None @@ -3106,8 +3108,6 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. 
- let mut keys_processed = 0; - while let Some(((key, lsn, val), desc)) = merge_iter .next_with_trace() .await @@ -3118,9 +3118,7 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - keys_processed += 1; let should_yield = yield_for_l0 - && keys_processed % 1000 == 0 && self .l0_compaction_trigger .notified()