pageserver read path: abort on fatal IO errors from disk / filesystem (#10786)
Before this PR, an IO error returned from the kernel, e.g. due to a bad
disk, would bubble up all the way to a user-visible query failure.
This violates the IO error handling policy we have established and is
hence rectified in this PR.
[(internal policy document link)](bef44149f7/src/storage/handling_io_and_logical_errors.md (L33-L35))
The practice on the write path seems to be that we call
`maybe_fatal_err()` or `fatal_err()` fairly high up the stack,
regardless of whether std::fs, tokio::fs, or VirtualFile is used to
perform the IO.
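As a rough sketch of how such a helper can be wired up (the trait name
`MaybeFatalIo` and its exact shape are assumptions for illustration;
only `maybe_fatal_err`, `fatal_err`, `is_fatal_io_error`, and
`on_fatal_io_error` appear in this change, and the latter two live in
the same pageserver module as the diff below):

```rust
// Sketch only: an extension trait over io::Result that screens for
// fatal kernel errors right where they surface.
pub(crate) trait MaybeFatalIo<T> {
    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
    fn fatal_err(self, context: &str) -> T;
}

impl<T> MaybeFatalIo<T> for std::io::Result<T> {
    /// Abort the process if the error is deemed fatal (bad disk, bad
    /// filesystem); otherwise pass the result through unchanged.
    fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
        if let Err(e) = &self {
            if is_fatal_io_error(e) {
                on_fatal_io_error(e, context);
            }
        }
        self
    }

    /// Abort the process on any IO error.
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => on_fatal_io_error(&e, context),
        }
    }
}
```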
For the read path, I chose a centralized approach in this PR: check for
errors as close to the kernel interface as possible. I believe this is
better for long-term consistency.
To mitigate the loss of context when we abort this far down the stack,
`on_fatal_io_error` now captures and logs a backtrace.
I grepped the pageserver code base for `fs::read` to convince myself
that all non-VirtualFile reads already handle IO errors according to
policy.
Refs
- fixes https://github.com/neondatabase/neon/issues/10454
Committed by: GitHub
Parent: 6a741fd1c2
Commit: 487f3202fe
@@ -496,7 +496,8 @@ pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
 /// bad storage or bad configuration, and we can't fix that from inside
 /// a running process.
 pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
-    tracing::error!("Fatal I/O error: {e}: {context})");
+    let backtrace = std::backtrace::Backtrace::force_capture();
+    tracing::error!("Fatal I/O error: {e}: {context})\n{backtrace}");
     std::process::abort();
 }
 
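The predicate `is_fatal_io_error` (whose body is not part of this hunk)
decides which kernel errors count as unrecoverable. A hypothetical
sketch of such a classification, with the errno list and the use of the
`libc` crate being assumptions, not the actual pageserver logic:

```rust
// Hypothetical sketch: the real predicate may use a different set of
// error kinds/errnos. EIO and EROFS are classic "the disk or
// filesystem is gone" signals from the kernel.
pub(crate) fn is_fatal_io_error_sketch(e: &std::io::Error) -> bool {
    matches!(e.raw_os_error(), Some(libc::EIO) | Some(libc::EROFS))
}
```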
@@ -947,13 +948,18 @@ impl VirtualFileInner {
     where
         Buf: tokio_epoll_uring::IoBufMut + Send,
     {
-        let file_guard = match self.lock_file().await {
+        let file_guard = match self
+            .lock_file()
+            .await
+            .maybe_fatal_err("lock_file inside VirtualFileInner::read_at")
+        {
             Ok(file_guard) => file_guard,
             Err(e) => return (buf, Err(e)),
         };
 
         observe_duration!(StorageIoOperation::Read, {
             let ((_file_guard, buf), res) = io_engine::get().read_at(file_guard, offset, buf).await;
+            let res = res.maybe_fatal_err("io_engine read_at inside VirtualFileInner::read_at");
             if let Ok(size) = res {
                 STORAGE_IO_SIZE
                     .with_label_values(&[
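Since `maybe_fatal_err` passes non-fatal errors through unchanged,
existing error handling above `VirtualFileInner` keeps working while
fatal errors abort before any caller can observe them. A hedged
demonstration, assuming the hypothetical `MaybeFatalIo` sketch from
above is in scope:

```rust
// Non-fatal errors survive the screening untouched:
let not_found: std::io::Result<usize> =
    Err(std::io::Error::from(std::io::ErrorKind::NotFound));
assert!(not_found.maybe_fatal_err("demo context").is_err()); // no abort

// A fatal kernel error (e.g. a raw EIO) would instead reach
// on_fatal_io_error, log the backtrace, and never return.
```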