mirror of
https://github.com/neondatabase/neon.git
synced 2025-12-22 21:59:59 +00:00
feat(pageserver): dump read path on missing key error (#10528)
## Problem helps investigate https://github.com/neondatabase/neon/issues/10482 ## Summary of changes In debug mode and testing mode, we will record all files visited by a read operation, and print it out when it errors. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -122,6 +122,7 @@ pub struct ConfigToml {
|
||||
pub wal_receiver_protocol: PostgresClientProtocol,
|
||||
pub page_service_pipelining: PageServicePipeliningConfig,
|
||||
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
pub enable_read_path_debugging: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -512,6 +513,11 @@ impl Default for ConfigToml {
|
||||
} else {
|
||||
GetVectoredConcurrentIo::SidecarTask
|
||||
},
|
||||
enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
|
||||
Some(true)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,6 +197,10 @@ pub struct PageServerConf {
|
||||
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
|
||||
|
||||
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
|
||||
|
||||
/// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
|
||||
/// files read.
|
||||
pub enable_read_path_debugging: bool,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -360,6 +364,7 @@ impl PageServerConf {
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
enable_read_path_debugging,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -446,6 +451,7 @@ impl PageServerConf {
|
||||
.unwrap_or_default(),
|
||||
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
|
||||
@@ -44,7 +44,7 @@ pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
|
||||
use self::inmemory_layer::InMemoryLayerFileId;
|
||||
|
||||
use super::timeline::GetVectoredError;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use super::PageReconstructError;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
@@ -262,6 +262,8 @@ pub(crate) struct ValuesReconstructState {
|
||||
|
||||
pub(crate) io_concurrency: IoConcurrency,
|
||||
num_active_ios: Arc<AtomicUsize>,
|
||||
|
||||
pub(crate) read_path: Option<ReadPath>,
|
||||
}
|
||||
|
||||
/// The level of IO concurrency to be used on the read path
|
||||
@@ -609,6 +611,7 @@ impl ValuesReconstructState {
|
||||
delta_layers_visited: 0,
|
||||
io_concurrency,
|
||||
num_active_ios: Arc::new(AtomicUsize::new(0)),
|
||||
read_path: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -626,6 +626,71 @@ impl From<layer_manager::Shutdown> for GetVectoredError {
|
||||
}
|
||||
}
|
||||
|
||||
/// A layer identifier when used in the [`ReadPath`] structure. This enum is for observability purposes
|
||||
/// only and not used by the "real read path".
|
||||
pub enum ReadPathLayerId {
|
||||
PersistentLayer(PersistentLayerKey),
|
||||
InMemoryLayer(Range<Lsn>),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPathLayerId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
ReadPathLayerId::PersistentLayer(key) => write!(f, "{}", key),
|
||||
ReadPathLayerId::InMemoryLayer(range) => {
|
||||
write!(f, "in-mem {}..{}", range.start, range.end)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub struct ReadPath {
|
||||
keyspace: KeySpace,
|
||||
lsn: Lsn,
|
||||
path: Vec<(ReadPathLayerId, KeySpace, Range<Lsn>)>,
|
||||
}
|
||||
|
||||
impl ReadPath {
|
||||
pub fn new(keyspace: KeySpace, lsn: Lsn) -> Self {
|
||||
Self {
|
||||
keyspace,
|
||||
lsn,
|
||||
path: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_layer_visit(
|
||||
&mut self,
|
||||
layer_to_read: &ReadableLayer,
|
||||
keyspace_to_read: &KeySpace,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) {
|
||||
let id = match layer_to_read {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
ReadPathLayerId::PersistentLayer(layer.layer_desc().key())
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
ReadPathLayerId::InMemoryLayer(layer.get_lsn_range())
|
||||
}
|
||||
};
|
||||
self.path
|
||||
.push((id, keyspace_to_read.clone(), lsn_range.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ReadPath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
writeln!(f, "Read path for {} at lsn {}:", self.keyspace, self.lsn)?;
|
||||
for (idx, (layer_id, keyspace, lsn_range)) in self.path.iter().enumerate() {
|
||||
writeln!(
|
||||
f,
|
||||
"{}: {} {}..{} {}",
|
||||
idx, layer_id, lsn_range.start, lsn_range.end, keyspace
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub struct MissingKeyError {
|
||||
key: Key,
|
||||
@@ -633,6 +698,8 @@ pub struct MissingKeyError {
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
/// Debug information about the read path if there's an error
|
||||
read_path: Option<ReadPath>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
@@ -649,10 +716,15 @@ impl std::fmt::Display for MissingKeyError {
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
|
||||
if let Some(ref read_path) = self.read_path {
|
||||
write!(f, "\n{}", read_path)?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
}
|
||||
@@ -1069,6 +1141,7 @@ impl Timeline {
|
||||
request_lsn: lsn,
|
||||
ancestor_lsn: None,
|
||||
backtrace: None,
|
||||
read_path: None,
|
||||
})),
|
||||
}
|
||||
}
|
||||
@@ -1195,6 +1268,13 @@ impl Timeline {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
|
||||
let read_path = if self.conf.enable_read_path_debugging {
|
||||
Some(ReadPath::new(keyspace.clone(), lsn))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let traversal_res: Result<(), _> = self
|
||||
.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
.await;
|
||||
@@ -3504,6 +3584,7 @@ impl Timeline {
|
||||
request_lsn,
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
backtrace: None,
|
||||
read_path: std::mem::take(&mut reconstruct_state.read_path),
|
||||
}));
|
||||
}
|
||||
|
||||
@@ -3622,6 +3703,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
|
||||
if let Some(ref mut read_path) = reconstruct_state.read_path {
|
||||
read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
|
||||
}
|
||||
let next_cont_lsn = lsn_range.start;
|
||||
layer_to_read
|
||||
.get_values_reconstruct_data(
|
||||
|
||||
Reference in New Issue
Block a user