mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-06 21:12:55 +00:00
fix(pageserver): make repartition error critical (#10872)
## Problem

Read errors during repartition should be treated as critical errors.

## Summary of changes

<del>We only have one call site</del> We have two call sites of `repartition`: one during the initial image upload optimization and the other during image layer creation. I therefore added a `critical!` here instead of inside `collect_keyspace`.

---

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler(
|
|||||||
match e {
|
match e {
|
||||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||||
|
CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||||
}
|
}
|
||||||
)?;
|
)?;
|
||||||
|
|||||||
@@ -3150,6 +3150,12 @@ impl Tenant {
|
|||||||
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
// Offload failures don't trip the circuit breaker, since they're cheap to retry and
|
||||||
// shouldn't block compaction.
|
// shouldn't block compaction.
|
||||||
CompactionError::Offload(_) => {}
|
CompactionError::Offload(_) => {}
|
||||||
|
CompactionError::CollectKeySpaceError(err) => {
|
||||||
|
self.compaction_circuit_breaker
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.fail(&CIRCUIT_BREAKERS_BROKEN, err);
|
||||||
|
}
|
||||||
CompactionError::Other(err) => {
|
CompactionError::Other(err) => {
|
||||||
self.compaction_circuit_breaker
|
self.compaction_circuit_breaker
|
||||||
.lock()
|
.lock()
|
||||||
|
|||||||
@@ -287,6 +287,7 @@ fn log_compaction_error(
|
|||||||
sleep_duration: Duration,
|
sleep_duration: Duration,
|
||||||
task_cancelled: bool,
|
task_cancelled: bool,
|
||||||
) {
|
) {
|
||||||
|
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||||
use crate::tenant::upload_queue::NotInitialized;
|
use crate::tenant::upload_queue::NotInitialized;
|
||||||
use crate::tenant::PageReconstructError;
|
use crate::tenant::PageReconstructError;
|
||||||
use CompactionError::*;
|
use CompactionError::*;
|
||||||
@@ -294,6 +295,8 @@ fn log_compaction_error(
|
|||||||
let level = match err {
|
let level = match err {
|
||||||
ShuttingDown => return,
|
ShuttingDown => return,
|
||||||
Offload(_) => Level::ERROR,
|
Offload(_) => Level::ERROR,
|
||||||
|
CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
|
||||||
|
CollectKeySpaceError(_) => Level::ERROR,
|
||||||
_ if task_cancelled => Level::INFO,
|
_ if task_cancelled => Level::INFO,
|
||||||
Other(err) => {
|
Other(err) => {
|
||||||
let root_cause = err.root_cause();
|
let root_cause = err.root_cause();
|
||||||
|
|||||||
@@ -1881,7 +1881,7 @@ impl Timeline {
|
|||||||
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
// Signal compaction failure to avoid L0 flush stalls when it's broken.
|
||||||
match result {
|
match result {
|
||||||
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
|
||||||
Err(CompactionError::Other(_)) => {
|
Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
|
||||||
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
self.compaction_failed.store(true, AtomicOrdering::Relaxed)
|
||||||
}
|
}
|
||||||
// Don't change the current value on offload failure or shutdown. We don't want to
|
// Don't change the current value on offload failure or shutdown. We don't want to
|
||||||
@@ -4604,7 +4604,10 @@ impl Timeline {
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
|
let (dense_ks, sparse_ks) = self
|
||||||
|
.collect_keyspace(lsn, ctx)
|
||||||
|
.await
|
||||||
|
.map_err(CompactionError::CollectKeySpaceError)?;
|
||||||
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
|
||||||
let sparse_partitioning = SparseKeyPartitioning {
|
let sparse_partitioning = SparseKeyPartitioning {
|
||||||
parts: vec![sparse_ks],
|
parts: vec![sparse_ks],
|
||||||
@@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError {
|
|||||||
#[error("Failed to offload timeline: {0}")]
|
#[error("Failed to offload timeline: {0}")]
|
||||||
Offload(OffloadError),
|
Offload(OffloadError),
|
||||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||||
|
#[error("Failed to collect keyspace: {0}")]
|
||||||
|
CollectKeySpaceError(CollectKeySpaceError),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Other(anyhow::Error),
|
Other(anyhow::Error),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ use std::sync::Arc;
|
|||||||
use super::layer_manager::LayerManager;
|
use super::layer_manager::LayerManager;
|
||||||
use super::{
|
use super::{
|
||||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
|
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
|
||||||
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
|
||||||
|
Timeline,
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{anyhow, bail, Context};
|
use anyhow::{anyhow, bail, Context};
|
||||||
@@ -31,6 +32,7 @@ use utils::id::TimelineId;
|
|||||||
|
|
||||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||||
use crate::page_cache;
|
use crate::page_cache;
|
||||||
|
use crate::pgdatadir_mapping::CollectKeySpaceError;
|
||||||
use crate::statvfs::Statvfs;
|
use crate::statvfs::Statvfs;
|
||||||
use crate::tenant::checks::check_valid_layermap;
|
use crate::tenant::checks::check_valid_layermap;
|
||||||
use crate::tenant::gc_block::GcBlock;
|
use crate::tenant::gc_block::GcBlock;
|
||||||
@@ -781,7 +783,17 @@ impl Timeline {
|
|||||||
//
|
//
|
||||||
// Suppress error when it's due to cancellation
|
// Suppress error when it's due to cancellation
|
||||||
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
if !self.cancel.is_cancelled() && !err.is_cancelled() {
|
||||||
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
|
if let CompactionError::CollectKeySpaceError(
|
||||||
|
CollectKeySpaceError::Decode(_)
|
||||||
|
| CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)),
|
||||||
|
) = err
|
||||||
|
{
|
||||||
|
critical!("could not compact, repartitioning keyspace failed: {err:?}");
|
||||||
|
} else {
|
||||||
|
tracing::error!(
|
||||||
|
"could not compact, repartitioning keyspace failed: {err:?}"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user