pageserver: add a few critical errors (#10657)

## Problem

Following #10641, let's add a few critical errors.

Resolves #10094.

## Summary of changes

Adds the following critical errors:

* WAL sender read/decode failure.
* WAL record ingestion failure.
* WAL redo failure.
* Missing key during compaction.

We don't add a critical error for missing keys during GetPage requests:
we've seen a handful of these in production recently, and the cause is
still unclear (most likely a benign race).
This commit is contained in:
Erik Grinaker
2025-02-06 11:30:27 +01:00
committed by GitHub
parent 05326cc247
commit f4cfa725b8
5 changed files with 47 additions and 20 deletions

View File

@@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames};
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
/// * Trigger a pageable alert (via the metric below).
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
/// * Trigger a pageable alert (via the metric above).
/// * In debug builds, panic the process.
///
/// When including errors in the message, please use {err:?} to include the error cause and original
/// backtrace.
#[macro_export]
macro_rules! critical {
($($arg:tt)*) => {
($($arg:tt)*) => {{
if cfg!(debug_assertions) {
panic!($($arg)*);
}
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
let backtrace = std::backtrace::Backtrace::capture();
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
};
}};
}
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]

View File

@@ -52,6 +52,7 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::critical;
use utils::rate_limit::RateLimit;
use utils::{
fs_ext,
@@ -5807,10 +5808,11 @@ impl Timeline {
let img = match res {
Ok(img) => img,
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
Err(walredo::Error::Other(e)) => {
Err(walredo::Error::Other(err)) => {
critical!("walredo failure during page reconstruction: {err:?}");
return Err(PageReconstructError::WalRedo(
e.context("reconstruct a page image"),
))
err.context("reconstruct a page image"),
));
}
};
Ok(img)

View File

@@ -10,8 +10,8 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
LastImageLayerCreationStatus, RecordedDuration, Timeline,
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
};
use anyhow::{anyhow, bail, Context};
@@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
use serde::Serialize;
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, info_span, trace, warn, Instrument};
use utils::critical;
use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
@@ -748,7 +749,15 @@ impl Timeline {
.as_ref()
.clone(),
)
.await?;
.await
.inspect_err(|err| {
if let CreateImageLayersError::GetVectoredError(
GetVectoredError::MissingKey(_),
) = err
{
critical!("missing key during compaction: {err:?}");
}
})?;
self.last_image_layer_creation_status
.store(Arc::new(outcome.clone()));

View File

@@ -39,7 +39,7 @@ use crate::{
use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
/// Status of the connection.
@@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection(
.await
.with_context(|| {
format!("could not ingest record at {local_next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() {
critical!("{err:?}")
}
})?;
uncommitted_records += 1;
@@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection(
.await
.with_context(|| {
format!("could not ingest record at {next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() {
critical!("{err:?}")
}
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");

View File

@@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio::sync::mpsc::error::SendError;
use tokio::task::JoinHandle;
use tokio::time::MissedTickBehavior;
use tracing::{info_span, Instrument};
use tracing::{error, info, info_span, Instrument};
use utils::critical;
use utils::lsn::Lsn;
use utils::postgres_client::Compression;
use utils::postgres_client::InterpretedFormat;
@@ -213,11 +214,10 @@ impl InterpretedWalReader {
metric.dec();
}
let res = reader.run_impl(start_pos).await;
if let Err(ref err) = res {
tracing::error!("Task finished with error: {err}");
}
res
reader
.run_impl(start_pos)
.await
.inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
}
.instrument(info_span!("interpreted wal reader")),
);
@@ -273,11 +273,10 @@ impl InterpretedWalReader {
metric.dec();
}
let res = self.run_impl(start_pos).await;
if let Err(err) = res {
tracing::error!("Interpreted wal reader encountered error: {err}");
if let Err(err) = self.run_impl(start_pos).await {
critical!("failed to read WAL record: {err:?}");
} else {
tracing::info!("Interpreted wal reader exiting");
info!("interpreted wal reader exiting");
}
Err(CopyStreamHandlerEnd::Other(anyhow!(