From e33e1094031578b384f448876af0048b14421b50 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 27 Jun 2025 17:26:00 +0200 Subject: [PATCH] fix(pageserver): buffered writer cancellation error handling (#12376) ## Problem The problem has been well described in already-commited PR #11853. tl;dr: BufferedWriter is sensitive to cancellation, which the previous approach was not. The write path was most affected (ingest & compaction), which was mostly fixed in #11853: it introduced `PutError` and mapped instances of `PutError` that were due to cancellation of underlying buffered writer into `CreateImageLayersError::Cancelled`. However, there is a long tail of remaining errors that weren't caught by #11853 that result in `CompactionError::Other`s, which we log with great noise. ## Solution The stack trace logging for CompactionError::Other added in #11853 allows us to chop away at that long tail using the following pattern: - look at the stack trace - from leaf up, identify the place where we incorrectly map from the distinguished variant X indicating cancellation to an `anyhow::Error` - follow that anyhow further up, ensuring it stays the same anyhow all the way up in the `CompactionError::Other` - since it stayed one anyhow chain all the way up, root_cause() will yield us X - so, in `log_compaction_error`, add an additional `downcast_ref` check for X This PR specifically adds checks for - the flush task cancelling (FlushTaskError, BlobWriterError) - opening of the layer writer (GateError) That should cover all the reports in issues - https://github.com/neondatabase/cloud/issues/29434 - https://github.com/neondatabase/neon/issues/12162 ## Refs - follow-up to #11853 - fixup of / fixes https://github.com/neondatabase/neon/issues/11762 - fixes https://github.com/neondatabase/neon/issues/12162 - refs https://github.com/neondatabase/cloud/issues/29434 --- libs/utils/src/sync/gate.rs | 8 ++++++++ pageserver/src/tenant/tasks.rs | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 93460785bf..862b2cff9e 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -86,6 +86,14 @@ pub enum GateError { GateClosed, } +impl GateError { + pub fn is_cancel(&self) -> bool { + match self { + GateError::GateClosed => true, + } + } +} + impl Default for Gate { fn default() -> Self { Self { diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 4709a6d616..954dd38bb4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -17,14 +17,17 @@ use tracing::*; use utils::backoff::exponential_backoff_duration; use utils::completion::Barrier; use utils::pausable_failpoint; +use utils::sync::gate::GateError; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS}; use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind}; +use crate::tenant::blob_io::WriteBlobError; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::compaction::CompactionOutcome; use crate::tenant::{TenantShard, TenantState}; +use crate::virtual_file::owned_buffers_io::write::FlushTaskError; /// Semaphore limiting concurrent background tasks (across all tenants). /// @@ -313,7 +316,20 @@ pub(crate) fn log_compaction_error( let timeline = root_cause .downcast_ref::() .is_some_and(|e| e.is_stopping()); - let is_stopping = upload_queue || timeline; + let buffered_writer_flush_task_canelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let write_blob_cancelled = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let gate_closed = root_cause + .downcast_ref::() + .is_some_and(|e| e.is_cancel()); + let is_stopping = upload_queue + || timeline + || buffered_writer_flush_task_canelled + || write_blob_cancelled + || gate_closed; if is_stopping { Level::INFO