From f9b05a42d7a408bc98f485463c13f58496101160 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Jul 2025 18:45:34 +0200 Subject: [PATCH] refactor(compaction): remove `CompactionError::AlreadyRunning` variant, use `::Other` instead (#12512) The only call stack that can emit the `::AlreadyRunning` variant is ``` -> iteration_inner -> iteration -> compaction_iteration -> compaction_loop -> start_background_loops ``` And on that call stack, the only differentiated handling of it is its invocations of `log_compaction_error -> CompactionError::is_cancel()`, which returns `true` for `::AlreadyRunning`. I think the condition of `AlreadyRunning` is severe; it really shouldn't happen. So, this PR starts treating it as something that is to be logged at `ERROR` / `WARN` level, depending on the `degrate_to_warning` argument to `log_compaction_error`. refs - https://databricks.atlassian.net/browse/LKB-182 --- pageserver/src/http/routes.rs | 3 +-- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/tasks.rs | 1 - pageserver/src/tenant/timeline.rs | 6 ------ pageserver/src/tenant/timeline/compaction.rs | 8 ++++---- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 23a090045a..55582659df 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2503,7 +2503,6 @@ async fn timeline_checkpoint_handler( CompactionError::ShuttingDown => ApiError::ShuttingDown, CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e), - CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)), } )?; } @@ -3697,7 +3696,7 @@ async fn tenant_evaluate_feature_flag( let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; - // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) + // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) // and we don't need to worry about it for now. let properties = tenant.feature_resolver.collect_properties(); if as_type.as_deref() == Some("boolean") { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ad767a1672..49b92915da 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3334,7 +3334,6 @@ impl TenantShard { .unwrap() .fail(&CIRCUIT_BREAKERS_BROKEN, err); } - CompactionError::AlreadyRunning(_) => {} } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 2ba1ad2674..356f495972 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error( let level = match err { e if e.is_cancel() => return, ShuttingDown => return, - AlreadyRunning(_) => Level::ERROR, CollectKeySpaceError(_) => Level::ERROR, _ if task_cancelled => Level::INFO, Other(err) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c2b49c0296..296b922599 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2067,9 +2067,6 @@ impl Timeline { Err(CompactionError::ShuttingDown) => { // Covered by the `Err(e) if e.is_cancel()` branch. } - Err(CompactionError::AlreadyRunning(_)) => { - // Covered by the `Err(e) if e.is_cancel()` branch. - } Err(CompactionError::Other(_)) => { self.compaction_failed.store(true, AtomicOrdering::Relaxed) } @@ -6018,8 +6015,6 @@ pub(crate) enum CompactionError { CollectKeySpaceError(#[from] CollectKeySpaceError), #[error(transparent)] Other(anyhow::Error), - #[error("Compaction already running: {0}")] - AlreadyRunning(&'static str), } impl CompactionError { @@ -6028,7 +6023,6 @@ impl CompactionError { matches!( self, Self::ShuttingDown - | Self::AlreadyRunning(_) | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled) | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead( PageReconstructError::Cancelled diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ac3930fb71..2c0b98c1e2 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -572,7 +572,7 @@ impl GcCompactionQueue { match res { Ok(res) => Ok(res), Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown), - Err(_) => { + Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => { // There are some cases where traditional gc might collect some layer // files causing gc-compaction cannot read the full history of the key. // This needs to be resolved in the long-term by improving the compaction @@ -591,9 +591,9 @@ impl GcCompactionQueue { timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { - return Err(CompactionError::AlreadyRunning( - "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.", - )); + return Err(CompactionError::Other(anyhow::anyhow!( + "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue." + ))); }; let has_pending_tasks; let mut yield_for_l0 = false;