From a4e3989c8da1bf0dc8a88b35a010055c29afd43f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:19:23 -0500 Subject: [PATCH] fix(pageserver): make repartition error critical (#10872) ## Problem Read errors during repartition should be a critical error. ## Summary of changes We only have one call site We have two call sites of `repartition` where one of them is during the initial image upload optimization and another is during image layer creation, so I added a `critical!` here instead of inside `collect_keyspace`. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 6 ++++++ pageserver/src/tenant/tasks.rs | 3 +++ pageserver/src/tenant/timeline.rs | 9 +++++++-- pageserver/src/tenant/timeline/compaction.rs | 16 ++++++++++++++-- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c2d5c3a933..56a84a98a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2395,6 +2395,7 @@ async fn timeline_checkpoint_handler( match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), + CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5d917da574..efb35625f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3150,6 +3150,12 @@ impl Tenant { // Offload failures don't trip the circuit breaker, since they're cheap to retry and // shouldn't block compaction. CompactionError::Offload(_) => {} + CompactionError::CollectKeySpaceError(err) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, err); + } CompactionError::Other(err) => { self.compaction_circuit_breaker .lock() diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 029444e973..5e63f59fd8 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -287,6 +287,7 @@ fn log_compaction_error( sleep_duration: Duration, task_cancelled: bool, ) { + use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::tenant::upload_queue::NotInitialized; use crate::tenant::PageReconstructError; use CompactionError::*; @@ -294,6 +295,8 @@ fn log_compaction_error( let level = match err { ShuttingDown => return, Offload(_) => Level::ERROR, + CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO, + CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { let root_cause = err.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 582825e890..ea966d2b43 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1881,7 +1881,7 @@ impl Timeline { // Signal compaction failure to avoid L0 flush stalls when it's broken. match result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(CompactionError::Other(_)) => { + Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } // Don't change the current value on offload failure or shutdown. We don't want to @@ -4604,7 +4604,10 @@ impl Timeline { )); } - let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?; + let (dense_ks, sparse_ks) = self + .collect_keyspace(lsn, ctx) + .await + .map_err(CompactionError::CollectKeySpaceError)?; let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size); let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], @@ -5319,6 +5322,8 @@ pub(crate) enum CompactionError { #[error("Failed to offload timeline: {0}")] Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. + #[error("Failed to collect keyspace: {0}")] + CollectKeySpaceError(CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9e082d74b5..4e4f906d78 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,7 +11,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, - ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, + ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration, + Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -31,6 +32,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::pgdatadir_mapping::CollectKeySpaceError; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::gc_block::GcBlock; @@ -781,7 +783,17 @@ impl Timeline { // // Suppress error when it's due to cancellation if !self.cancel.is_cancelled() && !err.is_cancelled() { - tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); + if let CompactionError::CollectKeySpaceError( + CollectKeySpaceError::Decode(_) + | CollectKeySpaceError::PageRead(PageReconstructError::MissingKey(_)), + ) = err + { + critical!("could not compact, repartitioning keyspace failed: {err:?}"); + } else { + tracing::error!( + "could not compact, repartitioning keyspace failed: {err:?}" + ); + } } } };