diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f9fdc143b4..17dc821345 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -99,6 +99,7 @@ use crate::tenant::remote_timeline_client::{ INITDB_PATH, MaybeDeletedIndexPart, remote_initdb_archive_path, }; use crate::tenant::storage_layer::{DeltaLayer, ImageLayer}; +use crate::tenant::timeline::CheckOtherForCancel; use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; @@ -3261,11 +3262,11 @@ impl TenantShard { /// Trips the compaction circuit breaker if appropriate. pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) { + if err.is_cancel(CheckOtherForCancel::No /* XXX flip this to Yes so that all the Other() errors that are cancel don't trip the circuit breaker? */) { + return; + } match err { - err if err.is_cancel() => {} - CompactionError::ShuttingDown => (), - // Offload failures don't trip the circuit breaker, since they're cheap to retry and - // shouldn't block compaction. + CompactionError::ShuttingDown => unreachable!("is_cancel"), CompactionError::Offload(_) => {} CompactionError::CollectKeySpaceError(err) => { // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch. @@ -3280,7 +3281,7 @@ impl TenantShard { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } - CompactionError::AlreadyRunning(_) => {} + CompactionError::AlreadyRunning(_) => unreachable!("is_cancel, but XXX why?"), } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..193177cbb5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -17,17 +17,14 @@ use tracing::*; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; -use utils::sync::gate::GateError; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; -use crate::tenant::blob_io::WriteBlobError; use crate::tenant::throttle::Stats; -use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; +use crate::tenant::timeline::{CheckOtherForCancel, CompactionError}; use crate::tenant::{TenantShard, TenantState}; -use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -295,48 +292,12 @@ pub(crate) fn log_compaction_error( task_cancelled: bool, degrade_to_warning: bool, ) { - use CompactionError::*; + let is_cancel = err.is_cancel(CheckOtherForCancel::Yes); - use crate::tenant::PageReconstructError; - use crate::tenant::upload_queue::NotInitialized; - - let level = match err { - e if e.is_cancel() => return, - ShuttingDown => return, - Offload(_) => Level::ERROR, - AlreadyRunning(_) => Level::ERROR, - CollectKeySpaceError(_) => Level::ERROR, - _ if task_cancelled => Level::INFO, - Other(err) => { - let root_cause = err.root_cause(); - - let upload_queue = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - let timeline = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_stopping()); - let buffered_writer_flush_task_canelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let write_blob_cancelled = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let gate_closed = root_cause - .downcast_ref::() - .is_some_and(|e| e.is_cancel()); - let is_stopping = upload_queue - || timeline - || buffered_writer_flush_task_canelled - || write_blob_cancelled - || gate_closed; - - if is_stopping { - Level::INFO - } else { - Level::ERROR - } - } + let level = if is_cancel || task_cancelled { + Level::INFO + } else { + Level::ERROR }; if let Some((error_count, sleep_duration)) = retry_info { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a6f6aa426f..f3f3261133 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -75,7 +75,7 @@ use utils::postgres_client::PostgresClientProtocol; use utils::rate_limit::RateLimit; use utils::seqwait::SeqWait; use utils::simple_rcu::{Rcu, RcuReadGuard}; -use utils::sync::gate::{Gate, GateGuard}; +use utils::sync::gate::{Gate, GateError, GateGuard}; use utils::{completion, critical, fs_ext, pausable_failpoint}; use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; @@ -116,6 +116,7 @@ use crate::pgdatadir_mapping::{ MAX_AUX_FILE_V2_DELTAS, MetricsUpdate, }; use crate::task_mgr::TaskKind; +use crate::tenant::blob_io::WriteBlobError; use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; use crate::tenant::layer_map::LayerMap; @@ -130,6 +131,7 @@ use crate::tenant::storage_layer::{ }; use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::timeline::logical_size::CurrentLogicalSize; +use crate::virtual_file::owned_buffers_io::write::FlushTaskError; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use crate::walingest::WalLagCooldown; use crate::walredo::RedoAttemptType; @@ -2061,9 +2063,10 @@ impl Timeline { }; // Signal compaction failure to avoid L0 flush stalls when it's broken. + // XXX this looks an awful lot like the circuit breaker code? Can we dedupe classification? match &result { Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed), - Err(e) if e.is_cancel() => {} + Err(e) if e.is_cancel(CheckOtherForCancel::No /* XXX flip this to Yes so that all the Other() errors that are cancel don't trip the circuit breaker? */) => {} Err(CompactionError::ShuttingDown) => { // Covered by the `Err(e) if e.is_cancel()` branch. } @@ -5935,19 +5938,61 @@ pub(crate) enum CompactionError { AlreadyRunning(&'static str), } +/// Whether [`CompactionError::is_cancel`] should inspect the +/// [`CompactionError::Other`] anyhow Error's root cause for +/// typical causes of cancellation. +pub(crate) enum CheckOtherForCancel { + No, + Yes, +} + impl CompactionError { /// Errors that can be ignored, i.e., cancel and shutdown. - pub fn is_cancel(&self) -> bool { - matches!( + pub fn is_cancel(&self, check_other: CheckOtherForCancel) -> bool { + if matches!( self, Self::ShuttingDown - | Self::AlreadyRunning(_) + | Self::AlreadyRunning(_) // XXX why do we treat AlreadyRunning as cancel? | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( PageReconstructError::Cancelled )) | Self::Offload(OffloadError::Cancelled) - ) + ) { + return true; + } + + let root_cause = match &check_other { + CheckOtherForCancel::No => return false, + CheckOtherForCancel::Yes => { + if let Self::Other(other) = self { + other.root_cause() + } else { + return false; + } + } + }; + + let upload_queue = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let timeline = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_stopping()); + let buffered_writer_flush_task_canelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let write_blob_cancelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let gate_closed = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + upload_queue + || timeline + || buffered_writer_flush_task_canelled + || write_blob_cancelled + || gate_closed } /// Critical errors that indicate data corruption. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7f5800d754..efb40f7ed9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -11,9 +11,9 @@ use std::time::{Duration, Instant}; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, DurationRecorder, - GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, - Timeline, + CheckOtherForCancel, CompactFlags, CompactOptions, CompactionError, CreateImageLayersError, + DurationRecorder, GetVectoredError, ImageLayerCreationMode, LastImageLayerCreationStatus, + RecordedDuration, Timeline, }; use crate::tenant::timeline::DeltaEntry; @@ -1396,7 +1396,7 @@ impl Timeline { // Suppress errors when cancelled. Err(_) if self.cancel.is_cancelled() => {} - Err(err) if err.is_cancel() => {} + Err(err) if err.is_cancel(CheckOtherForCancel::No) => {} // Alert on critical errors that indicate data corruption. Err(err) if err.is_critical() => {