fix(pageserver): make repartition error critical (#10872)

## Problem

Read errors during repartition should be a critical error.

## Summary of changes

<del>We only have one call site</del> We have two call sites of
`repartition` where one of them is during the initial image upload
optimization and another is during image layer creation, so I added a
`critical!` here instead of inside `collect_keyspace`.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-02-18 15:19:23 -05:00
committed by GitHub
parent 9d074db18d
commit a4e3989c8d
5 changed files with 31 additions and 4 deletions

View File

@@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler(
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;

View File

@@ -3150,6 +3150,12 @@ impl Tenant {
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
// shouldn't block compaction.
CompactionError::Offload(_) => {}
CompactionError::CollectKeySpaceError(err) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
}
CompactionError::Other(err) => {
self.compaction_circuit_breaker
.lock()

View File

@@ -287,6 +287,7 @@ fn log_compaction_error(
sleep_duration: Duration,
task_cancelled: bool,
) {
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::tenant::upload_queue::NotInitialized;
use crate::tenant::PageReconstructError;
use CompactionError::*;
@@ -294,6 +295,8 @@ fn log_compaction_error(
let level = match err {
ShuttingDown => return,
Offload(_) => Level::ERROR,
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
CollectKeySpaceError(_) => Level::ERROR,
_ if task_cancelled => Level::INFO,
Other(err) => {
let root_cause = err.root_cause();

View File

@@ -1881,7 +1881,7 @@ impl Timeline {
// Signal compaction failure to avoid L0 flush stalls when it's broken.
match result {
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
Err(CompactionError::Other(_)) => {
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
}
// Don't change the current value on offload failure or shutdown. We don't want to
@@ -4604,7 +4604,10 @@ impl Timeline {
));
}
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
let (dense_ks, sparse_ks) = self
.collect_keyspace(lsn, ctx)
.await
.map_err(CompactionError::CollectKeySpaceError)?;
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
let sparse_partitioning = SparseKeyPartitioning {
parts: vec![sparse_ks],
@@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError {
#[error("Failed to offload timeline: {0}")]
Offload(OffloadError),
/// Compaction cannot be done right now; page reconstruction and so on.
#[error("Failed to collect keyspace: {0}")]
CollectKeySpaceError(CollectKeySpaceError),
#[error(transparent)]
Other(anyhow::Error),
}

View File

@@ -11,7 +11,8 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
Timeline,
};
use anyhow::{anyhow, bail, Context};
@@ -31,6 +32,7 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::pgdatadir_mapping::CollectKeySpaceError;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::gc_block::GcBlock;
@@ -781,7 +783,17 @@ impl Timeline {
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() && !err.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
if let CompactionError::CollectKeySpaceError(
CollectKeySpaceError::Decode(_)
| CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)),
) = err
{
critical!("could not compact, repartitioning keyspace failed: {err:?}");
} else {
tracing::error!(
"could not compact, repartitioning keyspace failed: {err:?}"
);
}
}
}
};