diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 2b2fcc7711..5f39c46a84 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) { ); error_run_count += 1; let wait_duration = Duration::from_secs_f64(wait_duration); - error!( - "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}", + log_compaction_error( + &e, + error_run_count, + &wait_duration, + cancel.is_cancelled(), ); wait_duration } else { @@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) { TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); } +fn log_compaction_error( + e: &CompactionError, + error_run_count: u32, + sleep_duration: &std::time::Duration, + task_cancelled: bool, +) { + use crate::tenant::upload_queue::NotInitialized; + use crate::tenant::PageReconstructError; + use CompactionError::*; + + enum LooksLike { + Info, + Error, + } + + let decision = match e { + ShuttingDown => None, + _ if task_cancelled => Some(LooksLike::Info), + Other(e) => { + let root_cause = e.root_cause(); + + let is_stopping = { + let upload_queue = root_cause + .downcast_ref::<NotInitialized>() + .is_some_and(|e| e.is_stopping()); + + let timeline = root_cause + .downcast_ref::<PageReconstructError>() + .is_some_and(|e| e.is_stopping()); + + upload_queue || timeline + }; + + if is_stopping { + Some(LooksLike::Info) + } else { + Some(LooksLike::Error) + } + } + }; + + match decision { + Some(LooksLike::Info) => info!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}", + ),
Some(LooksLike::Error) => error!( + "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}", + ), + None => {} + } +} + /// /// GC task's main loop /// diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 666dae94e2..c21fe94d01 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -392,8 +392,7 @@ pub(crate) enum PageReconstructError { #[error("Ancestor LSN wait error: {0}")] AncestorLsnTimeout(#[from] WaitLsnError), - /// The operation was cancelled - #[error("Cancelled")] + #[error("timeline shutting down")] Cancelled, /// The ancestor of this is being stopped @@ -405,6 +404,19 @@ pub(crate) enum PageReconstructError { WalRedo(anyhow::Error), } +impl PageReconstructError { + /// Returns true if this error indicates a tenant/timeline shutdown-like situation + pub(crate) fn is_stopping(&self) -> bool { + use PageReconstructError::*; + match self { + Other(_) => false, + AncestorLsnTimeout(_) => false, + Cancelled | AncestorStopping(_) => true, + WalRedo(_) => false, + } + } +} + #[derive(thiserror::Error, Debug)] enum CreateImageLayersError { #[error("timeline shutting down")] diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index 32f14f40c5..0b61bc0a10 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped { pub(super) deleted_at: SetDeletedFlagProgress, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum NotInitialized { + #[error("queue is in state Uninitialized")] + Uninitialized, + #[error("queue is in state Stopping")] + Stopped, + #[error("queue is shutting down")] + ShuttingDown, +} + +impl NotInitialized { + pub(crate) fn is_stopping(&self) -> bool { + use NotInitialized::*; + match self { + Uninitialized => false, + Stopped => true, + ShuttingDown => true, + } + } +} + impl UploadQueue { pub(crate) fn 
initialize_empty_remote( &mut self, @@ -214,17 +235,17 @@ impl UploadQueue { } pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> { + use UploadQueue::*; match self { - UploadQueue::Uninitialized | UploadQueue::Stopped(_) => { - anyhow::bail!("queue is in state {}", self.as_str()) - } - UploadQueue::Initialized(x) => { - if !x.shutting_down { - Ok(x) + Uninitialized => Err(NotInitialized::Uninitialized.into()), + Initialized(x) => { + if x.shutting_down { + Err(NotInitialized::ShuttingDown.into()) } else { - anyhow::bail!("queue is shutting down") + Ok(x) } } + Stopped(_) => Err(NotInitialized::Stopped.into()), } }