mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-10 06:52:55 +00:00
pageserver: add a few critical errors (#10657)
## Problem Following #10641, let's add a few critical errors. Resolves #10094. ## Summary of changes Adds the following critical errors: * WAL sender read/decode failure. * WAL record ingestion failure. * WAL redo failure. * Missing key during compaction. We don't add an error for missing keys during GetPage requests, since we've seen a handful of these in production recently, and the cause is still unclear (most likely a benign race).
This commit is contained in:
@@ -8,19 +8,22 @@ use strum_macros::{EnumString, VariantNames};
|
||||
/// Logs a critical error, similarly to `tracing::error!`. This will:
|
||||
///
|
||||
/// * Emit an ERROR log message with prefix "CRITICAL:" and a backtrace.
|
||||
/// * Trigger a pageable alert (via the metric below).
|
||||
/// * Increment libmetrics_tracing_event_count{level="critical"}, and indirectly level="error".
|
||||
/// * Trigger a pageable alert (via the metric above).
|
||||
/// * In debug builds, panic the process.
|
||||
///
|
||||
/// When including errors in the message, please use {err:?} to include the error cause and original
|
||||
/// backtrace.
|
||||
#[macro_export]
|
||||
macro_rules! critical {
|
||||
($($arg:tt)*) => {
|
||||
($($arg:tt)*) => {{
|
||||
if cfg!(debug_assertions) {
|
||||
panic!($($arg)*);
|
||||
}
|
||||
$crate::logging::TRACING_EVENT_COUNT_METRIC.inc_critical();
|
||||
let backtrace = std::backtrace::Backtrace::capture();
|
||||
tracing::error!("CRITICAL: {}\n{backtrace}", format!($($arg)*));
|
||||
};
|
||||
}};
|
||||
}
|
||||
|
||||
#[derive(EnumString, strum_macros::Display, VariantNames, Eq, PartialEq, Debug, Clone, Copy)]
|
||||
|
||||
@@ -52,6 +52,7 @@ use tokio::{
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::critical;
|
||||
use utils::rate_limit::RateLimit;
|
||||
use utils::{
|
||||
fs_ext,
|
||||
@@ -5807,10 +5808,11 @@ impl Timeline {
|
||||
let img = match res {
|
||||
Ok(img) => img,
|
||||
Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
|
||||
Err(walredo::Error::Other(e)) => {
|
||||
Err(walredo::Error::Other(err)) => {
|
||||
critical!("walredo failure during page reconstruction: {err:?}");
|
||||
return Err(PageReconstructError::WalRedo(
|
||||
e.context("reconstruct a page image"),
|
||||
))
|
||||
err.context("reconstruct a page image"),
|
||||
));
|
||||
}
|
||||
};
|
||||
Ok(img)
|
||||
|
||||
@@ -10,8 +10,8 @@ use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
|
||||
ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
@@ -26,6 +26,7 @@ use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
|
||||
use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, info_span, trace, warn, Instrument};
|
||||
use utils::critical;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
@@ -748,7 +749,15 @@ impl Timeline {
|
||||
.as_ref()
|
||||
.clone(),
|
||||
)
|
||||
.await?;
|
||||
.await
|
||||
.inspect_err(|err| {
|
||||
if let CreateImageLayersError::GetVectoredError(
|
||||
GetVectoredError::MissingKey(_),
|
||||
) = err
|
||||
{
|
||||
critical!("missing key during compaction: {err:?}");
|
||||
}
|
||||
})?;
|
||||
|
||||
self.last_image_layer_creation_status
|
||||
.store(Arc::new(outcome.clone()));
|
||||
|
||||
@@ -39,7 +39,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
|
||||
use utils::{critical, id::NodeId, lsn::Lsn, postgres_client::PostgresClientProtocol};
|
||||
use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError};
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -393,6 +393,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {local_next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
|
||||
uncommitted_records += 1;
|
||||
@@ -520,6 +527,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("could not ingest record at {next_record_lsn}")
|
||||
})
|
||||
.inspect_err(|err| {
|
||||
// TODO: we can't differentiate cancellation errors with
|
||||
// anyhow::Error, so just ignore it if we're cancelled.
|
||||
if !cancellation.is_cancelled() {
|
||||
critical!("{err:?}")
|
||||
}
|
||||
})?;
|
||||
if !ingested {
|
||||
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
|
||||
|
||||
@@ -15,7 +15,8 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::sync::mpsc::error::SendError;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::MissedTickBehavior;
|
||||
use tracing::{info_span, Instrument};
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
use utils::critical;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::Compression;
|
||||
use utils::postgres_client::InterpretedFormat;
|
||||
@@ -213,11 +214,10 @@ impl InterpretedWalReader {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = reader.run_impl(start_pos).await;
|
||||
if let Err(ref err) = res {
|
||||
tracing::error!("Task finished with error: {err}");
|
||||
}
|
||||
res
|
||||
reader
|
||||
.run_impl(start_pos)
|
||||
.await
|
||||
.inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
|
||||
}
|
||||
.instrument(info_span!("interpreted wal reader")),
|
||||
);
|
||||
@@ -273,11 +273,10 @@ impl InterpretedWalReader {
|
||||
metric.dec();
|
||||
}
|
||||
|
||||
let res = self.run_impl(start_pos).await;
|
||||
if let Err(err) = res {
|
||||
tracing::error!("Interpreted wal reader encountered error: {err}");
|
||||
if let Err(err) = self.run_impl(start_pos).await {
|
||||
critical!("failed to read WAL record: {err:?}");
|
||||
} else {
|
||||
tracing::info!("Interpreted wal reader exiting");
|
||||
info!("interpreted wal reader exiting");
|
||||
}
|
||||
|
||||
Err(CopyStreamHandlerEnd::Other(anyhow!(
|
||||
|
||||
Reference in New Issue
Block a user