From f4cfa725b8ab67fafba2aef8761549c0cc010147 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 6 Feb 2025 11:30:27 +0100 Subject: [PATCH] pageserver: add a few critical errors (#10657) ## Problem Following #10641, let's add a few critical errors. Resolves #10094. ## Summary of changes Adds the following critical errors: * WAL sender read/decode failure. * WAL record ingestion failure. * WAL redo failure. * Missing key during compaction. We don't add an error for missing keys during GetPage requests, since we've seen a handful of these in production recently, and the cause is still unclear (most likely a benign race). --- libs/utils/src/logging.rs | 9 ++++++--- pageserver/src/tenant/timeline.rs | 8 +++++--- pageserver/src/tenant/timeline/compaction.rs | 15 ++++++++++++--- .../walreceiver/walreceiver_connection.rs | 16 +++++++++++++++- safekeeper/src/send_interpreted_wal.rs | 19 +++++++++---------- 5 files changed, 47 insertions(+), 20 deletions(-) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 753f05b6fd..4a6069294d 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames}; /// Logs a critical error, similarly to `tracing::error!`. This will: /// /// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace. +/// * Trigger a pageable alert (via the metric below). /// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error". -/// * Trigger a pageable alert (via the metric above). /// * In debug builds, panic the process. +/// +/// When including errors in the message, please use {err:?} to include the error cause and original +/// backtrace. #[macro_export] macro_rules! critical { - ($($arg:tt)*) => { + ($($arg:tt)*) => {{ if cfg!(debug_assertions) { panic!($($arg)*); } $crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical(); let backtrace = std::backtrace::Backtrace::capture(); tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*)); - }; + }}; } #[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)] diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 45ddd38f67..908356c459 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -52,6 +52,7 @@ use tokio::{ }; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::critical; use utils::rate_limit::RateLimit; use utils::{ fs_ext, @@ -5807,10 +5808,11 @@ impl Timeline { let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), - Err(walredo::Error::Other(e)) => { + Err(walredo::Error::Other(err)) => { + critical!("walredo failure during page reconstruction: {err:?}"); return Err(PageReconstructError::WalRedo( - e.context("reconstruct a page image"), - )) + err.context("reconstruct a page image"), + )); } }; Ok(img) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index cfde070442..b9f4954453 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - LastImageLayerCreationStatus, RecordedDuration, Timeline, + CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError, + ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; use serde::Serialize; use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, trace, warn, Instrument}; +use utils::critical; use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; @@ -748,7 +749,15 @@ impl Timeline { .as_ref() .clone(), ) - .await?; + .await + .inspect_err(|err| { + if let CreateImageLayersError::GetVectoredError( + GetVectoredError::MissingKey(_), + ) = err + { + critical!("missing key during compaction: {err:?}"); + } + })?; self.last_image_layer_creation_status .store(Arc::new(outcome.clone())); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index de917377cb..23db4f88d2 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -39,7 +39,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; +use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. @@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {local_next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; uncommitted_records += 1; @@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection( .await .with_context(|| { format!("could not ingest record at {next_record_lsn}") + }) + .inspect_err(|err| { + // TODO: we can't differentiate cancellation errors with + // anyhow::Error, so just ignore it if we're cancelled. + if !cancellation.is_cancelled() { + critical!("{err:?}") + } })?; if !ingested { tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}"); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index b57cc8001d..5916675c3f 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio::sync::mpsc::error::SendError; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; -use tracing::{info_span, Instrument}; +use tracing::{error, info, info_span, Instrument}; +use utils::critical; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; @@ -213,11 +214,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = reader.run_impl(start_pos).await; - if let Err(ref err) = res { - tracing::error!("Task finished with error: {err}"); - } - res + reader + .run_impl(start_pos) + .await + .inspect_err(|err| critical!("failed to read WAL record: {err:?}")) } .instrument(info_span!("interpreted wal reader")), ); @@ -273,11 +273,10 @@ impl InterpretedWalReader { metric.dec(); } - let res = self.run_impl(start_pos).await; - if let Err(err) = res { - tracing::error!("Interpreted wal reader encountered error: {err}"); + if let Err(err) = self.run_impl(start_pos).await { + critical!("failed to read WAL record: {err:?}"); } else { - tracing::info!("Interpreted wal reader exiting"); + info!("interpreted wal reader exiting"); } Err(CopyStreamHandlerEnd::Other(anyhow!(