fix(pageserver): more aggressively yield in gc-compaction, degrade errors to warnings (#11469)

## Problem

Fix various small issues discovered during gc-compaction rollout.

## Summary of changes

- Log level changes: if errors are from gc-compaction, fire a warning
instead of errors or critical errors.
- Yield to L0 compaction more aggressively. Instead of checking every 1k
keys, we check on every key. Sometimes a single key reconstruct takes a
long time.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-04-07 17:19:06 -04:00
committed by GitHub
parent 99d8788756
commit 0875dacce0
3 changed files with 39 additions and 9 deletions

View File

@@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run += 1;
let backoff =
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
log_compaction_error(
&err,
Some((error_run, backoff)),
cancel.is_cancelled(),
false,
);
continue;
}
}
@@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error(
err: &CompactionError,
retry_info: Option<(u32, Duration)>,
task_cancelled: bool,
degrade_to_warning: bool,
) {
use CompactionError::*;
@@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error(
}
} else {
match level {
Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
Level::ERROR => error!("Compaction failed: {err:#}"),
Level::INFO => info!("Compaction failed: {err:#}"),
level => unimplemented!("unexpected level {level:?}"),

View File

@@ -1940,7 +1940,7 @@ impl Timeline {
)
.await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled());
log_compaction_error(err, None, cancel.is_cancelled(), false);
}
res
}
@@ -6353,10 +6353,33 @@ impl Timeline {
/// Reconstruct a value, using the given base image and WAL records in 'data'.
async fn reconstruct_value(
&self,
key: Key,
request_lsn: Lsn,
data: ValueReconstructState,
) -> Result<Bytes, PageReconstructError> {
self.reconstruct_value_inner(key, request_lsn, data, false)
.await
}
/// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
/// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
async fn reconstruct_value_wo_critical_error(
&self,
key: Key,
request_lsn: Lsn,
data: ValueReconstructState,
) -> Result<Bytes, PageReconstructError> {
self.reconstruct_value_inner(key, request_lsn, data, true)
.await
}
async fn reconstruct_value_inner(
&self,
key: Key,
request_lsn: Lsn,
mut data: ValueReconstructState,
no_critical_error: bool,
) -> Result<Bytes, PageReconstructError> {
// Perform WAL redo if needed
data.records.reverse();
@@ -6413,7 +6436,9 @@ impl Timeline {
Ok(img) => img,
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(err)) => {
critical!("walredo failure during page reconstruction: {err:?}");
if !no_critical_error {
critical!("walredo failure during page reconstruction: {err:?}");
}
return Err(PageReconstructError::WalRedo(
err.context("reconstruct a page image"),
));

View File

@@ -448,7 +448,7 @@ impl GcCompactionQueue {
) -> Result<CompactionOutcome, CompactionError> {
let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled());
log_compaction_error(err, None, cancel.is_cancelled(), true);
}
match res {
Ok(res) => Ok(res),
@@ -2410,7 +2410,9 @@ impl Timeline {
} else {
lsn_split_points[i]
};
let img = self.reconstruct_value(key, request_lsn, state).await?;
let img = self
.reconstruct_value_wo_critical_error(key, request_lsn, state)
.await?;
Some((request_lsn, img))
} else {
None
@@ -3106,8 +3108,6 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
let mut keys_processed = 0;
while let Some(((key, lsn, val), desc)) = merge_iter
.next_with_trace()
.await
@@ -3118,9 +3118,7 @@ impl Timeline {
return Err(CompactionError::ShuttingDown);
}
keys_processed += 1;
let should_yield = yield_for_l0
&& keys_processed % 1000 == 0
&& self
.l0_compaction_trigger
.notified()