From 6a4f49b08bf8e96ad62fd9ac1550acf3e58d4179 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:35:33 -0400 Subject: [PATCH] fix(pageserver): passthrough partition cancel error (#9154) close https://github.com/neondatabase/neon/issues/9142 ## Summary of changes passthrough CollectKeyspaceError::Cancelled to CompactionError::ShuttingDown Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 18 ++++++++++++++---- pageserver/src/tenant/timeline/compaction.rs | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d301ba23ea..157c6ab91e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3601,7 +3601,7 @@ impl Timeline { ctx, ) .await - .map_err(|e| FlushLayerError::from_anyhow(self, e))?; + .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?; if self.cancel.is_cancelled() { return Err(FlushLayerError::Cancelled); @@ -3840,16 +3840,20 @@ impl Timeline { partition_size: u64, flags: EnumSet, ctx: &RequestContext, - ) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> { + ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - anyhow::bail!("repartition() called concurrently, this should not happen"); + return Err(CompactionError::Other(anyhow!( + "repartition() called concurrently, this should not happen" + ))); }; let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; if lsn < *partition_lsn { - anyhow::bail!("repartition() called with LSN going backwards, this should not happen"); + return Err(CompactionError::Other(anyhow!( + "repartition() called with LSN going backwards, this should not happen" + ))); } let distance = lsn.0 - partition_lsn.0; @@ -4451,6 +4455,12 @@ pub(crate) enum CompactionError { Other(anyhow::Error), } +impl CompactionError { + pub fn is_cancelled(&self) -> bool { + matches!(self, CompactionError::ShuttingDown) + } +} + impl From for CompactionError { fn from(err: CollectKeySpaceError) -> Self { match err { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6edc28a11b..3de386a2d5 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -390,7 +390,7 @@ impl Timeline { // error but continue. // // Suppress error when it's due to cancellation - if !self.cancel.is_cancelled() { + if !self.cancel.is_cancelled() && !err.is_cancelled() { tracing::error!("could not compact, repartitioning keyspace failed: {err:?}"); } (1, false)