pageserver read path: abort on fatal IO errors from disk / filesystem (#10786)
Before this PR, an IO error returned from the kernel, e.g., due to a bad
disk, would get bubbled up all the way to a user-visible query failure.
This violates the IO error handling policy that we have established,
and this PR rectifies it.
[Internal policy document](bef44149f7/src/storage/handling_io_and_logical_errors.md (L33-L35))
The established practice on the write path appears to be to call
`maybe_fatal_err()` or `fatal_err()` fairly high up the stack,
regardless of whether `std::fs`, `tokio::fs`, or `VirtualFile` is used
to perform the IO.
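
For context, here is a minimal, self-contained sketch of that pattern. The trait name (`MaybeFatalIo`), the error classification, and the use of `eprintln!` instead of `tracing::error!` are assumptions made for the sketch; the real helpers in the pageserver may be named and implemented differently (see `is_fatal_io_error` / `on_fatal_io_error` in the diff below).

```rust
use std::io;

/// Classify an IO error: anything that suggests bad hardware, a bad
/// filesystem, or bad configuration is fatal (illustrative classification
/// only; the real `is_fatal_io_error` in the pageserver is more careful).
fn is_fatal_io_error(e: &io::Error) -> bool {
    use io::ErrorKind::*;
    !matches!(e.kind(), NotFound | AlreadyExists | InvalidInput)
}

/// Log the error plus a backtrace, then abort the process.
/// `eprintln!` stands in for `tracing::error!` to keep the sketch
/// dependency-free.
fn on_fatal_io_error(e: &io::Error, context: &str) -> ! {
    let backtrace = std::backtrace::Backtrace::force_capture();
    eprintln!("Fatal I/O error: {e}: {context}\n{backtrace}");
    std::process::abort();
}

/// Extension trait, assumed here under the name `MaybeFatalIo`, providing
/// the `maybe_fatal_err()` / `fatal_err()` helpers mentioned above.
trait MaybeFatalIo<T> {
    /// Abort on fatal IO errors, pass all other errors through.
    fn maybe_fatal_err(self, context: &str) -> io::Result<T>;
    /// Treat every IO error as fatal.
    fn fatal_err(self, context: &str) -> T;
}

impl<T> MaybeFatalIo<T> for io::Result<T> {
    fn maybe_fatal_err(self, context: &str) -> io::Result<T> {
        if let Err(e) = &self {
            if is_fatal_io_error(e) {
                on_fatal_io_error(e, context);
            }
        }
        self
    }

    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => on_fatal_io_error(&e, context),
        }
    }
}

fn main() {
    // Write-path style: the helper is called high up the stack, close to
    // the logical operation, regardless of which API performed the IO.
    std::fs::write("/tmp/io-policy-example", b"ok").fatal_err("write example file");
}
```
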
For the read path, this PR takes a centralized approach and checks for
errors as close to the kernel interface as possible.
I believe this is better for long-term consistency.
To mitigate the loss of context from aborting that far down the stack,
`on_fatal_io_error` now captures and logs a backtrace.
I grepped the pageserver code base for `fs::read` to convince myself
that all non-VirtualFile reads already handle IO errors according to
policy.
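
To illustrate what handling "according to policy" means at such a call site, here is a rough, self-contained sketch of a non-VirtualFile read; the function name, path, and context string are made up, and the inline abort mirrors what `.fatal_err(...)` from the extension-trait sketch above would do.

```rust
use std::path::Path;

/// Illustrative only: a small metadata read outside of VirtualFile.
/// Per the policy, a kernel-level IO error is not bubbled up to the
/// caller as a query error; we log it and abort the process instead.
fn read_marker_file(path: &Path) -> Vec<u8> {
    match std::fs::read(path) {
        Ok(bytes) => bytes,
        Err(e) => {
            // Roughly what `.fatal_err("read marker file")` expands to
            // with the extension trait sketched earlier.
            let backtrace = std::backtrace::Backtrace::force_capture();
            eprintln!("Fatal I/O error: {e}: read marker file {path:?}\n{backtrace}");
            std::process::abort();
        }
    }
}

fn main() {
    let bytes = read_marker_file(Path::new("/tmp/marker"));
    println!("read {} bytes", bytes.len());
}
```
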
Refs
- fixes https://github.com/neondatabase/neon/issues/10454
committed by GitHub
parent 6a741fd1c2
commit 487f3202fe
@@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
 /// bad storage or bad configuration, and we can't fix that from inside
 /// a running process.
 pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
-    tracing::error!("Fatal I/O error: {e}: {context})");
+    let backtrace = std::backtrace::Backtrace::force_capture();
+    tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}");
     std::process::abort();
 }
 
@@ -947,13 +948,18 @@ impl VirtualFileInner {
     where
         Buf: tokio_epoll_uring::IoBufMut + Send,
     {
-        let file_guard = match self.lock_file().await {
+        let file_guard = match self
+            .lock_file()
+            .await
+            .maybe_fatal_err("lock_file inside VirtualFileInner::read_at")
+        {
             Ok(file_guard) => file_guard,
             Err(e) => return (buf, Err(e)),
         };
 
         observe_duration!(StorageIoOperation::Read, {
             let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
+            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
             if let Ok(size) = res {
                 STORAGE_IO_SIZE
                     .with_label_values(&[