fix(pageserver): log compaction errors with timeline ids (#11231)

## Problem

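Compaction errors are logged without timeline ids, which makes it hard to tell which timeline a failure belongs to. Including the timeline id makes these errors easier to debug.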

## Summary of changes

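Log compaction errors with timeline ids. `log_compaction_error` is now `pub(crate)` and takes an optional `retry_info: Option<(u32, Duration)>` instead of separate error-count and sleep-duration arguments, so it can also be called (with `None`) from the timeline-level compaction code in `impl Timeline` and `GcCompactionQueue`, in addition to the existing call in the tenant `compaction_loop`.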

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-03-17 15:42:02 -04:00
committed by GitHub
parent 24f41bee5c
commit bb64beffbb
3 changed files with 54 additions and 22 deletions

View File

@@ -268,7 +268,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
error_run += 1;
let backoff =
exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
log_compaction_error(&err, error_run, backoff, cancel.is_cancelled());
log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
continue;
}
}
@@ -281,10 +281,9 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
}
fn log_compaction_error(
pub(crate) fn log_compaction_error(
err: &CompactionError,
error_count: u32,
sleep_duration: Duration,
retry_info: Option<(u32, Duration)>,
task_cancelled: bool,
) {
use CompactionError::*;
@@ -318,14 +317,26 @@ fn log_compaction_error(
}
};
match level {
Level::ERROR => {
error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
if let Some((error_count, sleep_duration)) = retry_info {
match level {
Level::ERROR => {
error!(
"Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}"
)
}
Level::INFO => {
info!(
"Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}"
)
}
level => unimplemented!("unexpected level {level:?}"),
}
Level::INFO => {
info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}")
} else {
match level {
Level::ERROR => error!("Compaction failed: {err:#}"),
Level::INFO => info!("Compaction failed: {err:#}"),
level => unimplemented!("unexpected level {level:?}"),
}
level => unimplemented!("unexpected level {level:?}"),
}
}

View File

@@ -89,6 +89,7 @@ use super::remote_timeline_client::index::{GcCompactionState, IndexPart};
use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
use super::secondary::heatmap::HeatMapLayer;
use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
use super::tasks::log_compaction_error;
use super::upload_queue::NotInitialized;
use super::{
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
@@ -1856,18 +1857,23 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<CompactionOutcome, CompactionError> {
self.compact_with_options(
cancel,
CompactOptions {
flags,
compact_key_range: None,
compact_lsn_range: None,
sub_compaction: false,
sub_compaction_max_job_size_mb: None,
},
ctx,
)
.await
let res = self
.compact_with_options(
cancel,
CompactOptions {
flags,
compact_key_range: None,
compact_lsn_range: None,
sub_compaction: false,
sub_compaction_max_job_size_mb: None,
},
ctx,
)
.await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled());
}
res
}
/// Outermost timeline compaction operation; downloads needed layers.

View File

@@ -56,6 +56,7 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::{
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
};
use crate::tenant::tasks::log_compaction_error;
use crate::tenant::timeline::{
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
ResidentLayer, drop_rlock,
@@ -440,6 +441,20 @@ impl GcCompactionQueue {
ctx: &RequestContext,
gc_block: &GcBlock,
timeline: &Arc<Timeline>,
) -> Result<CompactionOutcome, CompactionError> {
let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
if let Err(err) = &res {
log_compaction_error(err, None, cancel.is_cancelled());
}
res
}
async fn iteration_inner(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
gc_block: &GcBlock,
timeline: &Arc<Timeline>,
) -> Result<CompactionOutcome, CompactionError> {
let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
return Err(CompactionError::AlreadyRunning(