mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-05 12:32:54 +00:00
fix(pageserver): make repartition error critical (#10872)
## Problem Read errors during repartition should be a critical error. ## Summary of changes <del>We only have one call site</del> We have two call sites of `repartition` where one of them is during the initial image upload optimization and another is during image layer creation, so I added a `critical!` here instead of inside `collect_keyspace`. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler(
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
|
||||
@@ -3150,6 +3150,12 @@ impl Tenant {
|
||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||
// shouldn't block compaction.
|
||||
CompactionError::Offload(_) => {}
|
||||
CompactionError::CollectKeySpaceError(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
.unwrap()
|
||||
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||
}
|
||||
CompactionError::Other(err) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
|
||||
@@ -287,6 +287,7 @@ fn log_compaction_error(
|
||||
sleep_duration: Duration,
|
||||
task_cancelled: bool,
|
||||
) {
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::tenant::upload_queue::NotInitialized;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use CompactionError::*;
|
||||
@@ -294,6 +295,8 @@ fn log_compaction_error(
|
||||
let level = match err {
|
||||
ShuttingDown => return,
|
||||
Offload(_) => Level::ERROR,
|
||||
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
|
||||
CollectKeySpaceError(_) => Level::ERROR,
|
||||
_ if task_cancelled => Level::INFO,
|
||||
Other(err) => {
|
||||
let root_cause = err.root_cause();
|
||||
|
||||
@@ -1881,7 +1881,7 @@ impl Timeline {
|
||||
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
||||
match result {
|
||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||
Err(CompactionError::Other(_)) => {
|
||||
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||
}
|
||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||
@@ -4604,7 +4604,10 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
||||
let (dense_ks, sparse_ks) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.await
|
||||
.map_err(CompactionError::CollectKeySpaceError)?;
|
||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||
let sparse_partitioning = SparseKeyPartitioning {
|
||||
parts: vec![sparse_ks],
|
||||
@@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError {
|
||||
#[error("Failed to offload timeline: {0}")]
|
||||
Offload(OffloadError),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
#[error("Failed to collect keyspace: {0}")]
|
||||
CollectKeySpaceError(CollectKeySpaceError),
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
@@ -11,7 +11,8 @@ use std::sync::Arc;
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
|
||||
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
||||
ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
|
||||
Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
@@ -31,6 +32,7 @@ use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::gc_block::GcBlock;
|
||||
@@ -781,7 +783,17 @@ impl Timeline {
|
||||
//
|
||||
// Suppress error when it's due to cancellation
|
||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
if let CompactionError::CollectKeySpaceError(
|
||||
CollectKeySpaceError::Decode(_)
|
||||
| CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)),
|
||||
) = err
|
||||
{
|
||||
critical!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||
} else {
|
||||
tracing::error!(
|
||||
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user