mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
fix(pageserver): more aggressively yield in gc-compaction, degrade errors to warnings (#11469)
## Problem Fix various small issues discovered during gc-compaction rollout. ## Summary of changes - Log level changes: if errors are from gc-compaction, fire a warning instead of errors or critical errors. - Yield to L0 compaction more aggressively. Instead of checking every 1k keys, we check on every key. Sometimes a single key reconstruct takes a long time. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
error_run += 1;
|
||||
let backoff =
|
||||
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
|
||||
log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
|
||||
log_compaction_error(
|
||||
&err,
|
||||
Some((error_run, backoff)),
|
||||
cancel.is_cancelled(),
|
||||
false,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error(
|
||||
err: &CompactionError,
|
||||
retry_info: Option<(u32, Duration)>,
|
||||
task_cancelled: bool,
|
||||
degrade_to_warning: bool,
|
||||
) {
|
||||
use CompactionError::*;
|
||||
|
||||
@@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error(
|
||||
}
|
||||
} else {
|
||||
match level {
|
||||
Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
|
||||
Level::ERROR => error!("Compaction failed: {err:#}"),
|
||||
Level::INFO => info!("Compaction failed: {err:#}"),
|
||||
level => unimplemented!("unexpected level {level:?}"),
|
||||
|
||||
@@ -1940,7 +1940,7 @@ impl Timeline {
|
||||
)
|
||||
.await;
|
||||
if let Err(err) = &res {
|
||||
log_compaction_error(err, None, cancel.is_cancelled());
|
||||
log_compaction_error(err, None, cancel.is_cancelled(), false);
|
||||
}
|
||||
res
|
||||
}
|
||||
@@ -6353,10 +6353,33 @@ impl Timeline {
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'.
|
||||
async fn reconstruct_value(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
data: ValueReconstructState,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.reconstruct_value_inner(key, request_lsn, data, false)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
|
||||
/// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
|
||||
async fn reconstruct_value_wo_critical_error(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
data: ValueReconstructState,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.reconstruct_value_inner(key, request_lsn, data, true)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn reconstruct_value_inner(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
no_critical_error: bool,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
@@ -6413,7 +6436,9 @@ impl Timeline {
|
||||
Ok(img) => img,
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(err)) => {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
if !no_critical_error {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
}
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
err.context("reconstruct a page image"),
|
||||
));
|
||||
|
||||
@@ -448,7 +448,7 @@ impl GcCompactionQueue {
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
|
||||
if let Err(err) = &res {
|
||||
log_compaction_error(err, None, cancel.is_cancelled());
|
||||
log_compaction_error(err, None, cancel.is_cancelled(), true);
|
||||
}
|
||||
match res {
|
||||
Ok(res) => Ok(res),
|
||||
@@ -2410,7 +2410,9 @@ impl Timeline {
|
||||
} else {
|
||||
lsn_split_points[i]
|
||||
};
|
||||
let img = self.reconstruct_value(key, request_lsn, state).await?;
|
||||
let img = self
|
||||
.reconstruct_value_wo_critical_error(key, request_lsn, state)
|
||||
.await?;
|
||||
Some((request_lsn, img))
|
||||
} else {
|
||||
None
|
||||
@@ -3106,8 +3108,6 @@ impl Timeline {
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut keys_processed = 0;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
.await
|
||||
@@ -3118,9 +3118,7 @@ impl Timeline {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
keys_processed += 1;
|
||||
let should_yield = yield_for_l0
|
||||
&& keys_processed % 1000 == 0
|
||||
&& self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
|
||||
Reference in New Issue
Block a user