From 5650138532e5e3ca4a06f15f3ed79d6a3545f5a0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 2 Nov 2023 09:14:26 +0000 Subject: [PATCH] pageserver: helpers for explicitly dying on fatal I/O errors (#5651) Following from discussion on https://github.com/neondatabase/neon/pull/5436 where hacking an implicit die-on-fatal-io behavior into an Error type was a source of disagreement -- in this PR, dying on fatal I/O errors is explicit, with `fatal_err` and `maybe_fatal_err` helpers in the `MaybeFatalIo` trait, which is implemented for std::io::Result. To enable this approach with `crashsafe_overwrite`, the return type of that function is changed to std::io::Result -- the previous error enum for this function was not used for any logic, and the utility of saying exactly which step in the function failed is outweighed by the hygiene of having an I/O funciton return an io::Result. The initial use case for these helpers is the deletion queue. --- pageserver/src/deletion_queue.rs | 6 +- pageserver/src/deletion_queue/list_writer.rs | 31 ++--- pageserver/src/deletion_queue/validator.rs | 14 +- pageserver/src/virtual_file.rs | 135 +++++++++++-------- 4 files changed, 104 insertions(+), 82 deletions(-) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 22efa23f10..7b2db929fa 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::virtual_file::MaybeFatalIo; use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; @@ -271,7 +272,9 @@ impl DeletionHeader { let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) .await - .map_err(Into::into) + .maybe_fatal_err("save deletion header")?; + + Ok(()) } } @@ -360,6 +363,7 @@ impl DeletionList { let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) .await + .maybe_fatal_err("save deletion list") .map_err(Into::into) } } diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 6727957b2a..28daae2da5 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::storage_layer::LayerFileName; +use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::MaybeFatalIo; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order @@ -195,7 +197,7 @@ impl ListWriter { debug!("Deletion header {header_path} not found, first start?"); Ok(None) } else { - Err(anyhow::anyhow!(e)) + on_fatal_io_error(&e, "reading deletion header"); } } } @@ -216,16 +218,9 @@ impl ListWriter { self.pending.sequence = validated_sequence + 1; let deletion_directory = self.conf.deletion_prefix(); - let mut dir = match tokio::fs::read_dir(&deletion_directory).await { - Ok(d) => d, - Err(e) => { - warn!("Failed to open deletion list directory {deletion_directory}: {e:#}"); - - // Give up: if we can't read the deletion list directory, we probably can't - // write lists into it later, so the queue won't work. - return Err(e.into()); - } - }; + let mut dir = tokio::fs::read_dir(&deletion_directory) + .await + .fatal_err("read deletion directory"); let list_name_pattern = Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); @@ -233,7 +228,7 @@ impl ListWriter { let temp_extension = format!(".{TEMP_SUFFIX}"); let header_path = self.conf.deletion_header_path(); let mut seqs: Vec = Vec::new(); - while let Some(dentry) = dir.next_entry().await? { + while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") { let file_name = dentry.file_name(); let dentry_str = file_name.to_string_lossy(); @@ -246,11 +241,9 @@ impl ListWriter { info!("Cleaning up temporary file {dentry_str}"); let absolute_path = deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); - if let Err(e) = tokio::fs::remove_file(&absolute_path).await { - // Non-fatal error: we will just leave the file behind but not - // try and load it. - warn!("Failed to clean up temporary file {absolute_path}: {e:#}"); - } + tokio::fs::remove_file(&absolute_path) + .await + .fatal_err("delete temp file"); continue; } @@ -290,7 +283,9 @@ impl ListWriter { for s in seqs { let list_path = self.conf.deletion_list_path(s); - let list_bytes = tokio::fs::read(&list_path).await?; + let list_bytes = tokio::fs::read(&list_path) + .await + .fatal_err("read deletion list"); let mut deletion_list = match serde_json::from_slice::(&list_bytes) { Ok(l) => l, diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index a2cbfb9dc7..72bdbdefd6 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -28,6 +28,7 @@ use crate::config::PageServerConf; use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::control_plane_client::RetryForeverError; use crate::metrics; +use crate::virtual_file::MaybeFatalIo; use super::deleter::DeleterMessage; use super::DeletionHeader; @@ -287,16 +288,9 @@ where async fn cleanup_lists(&mut self, list_paths: Vec) { for list_path in list_paths { debug!("Removing deletion list {list_path}"); - - if let Err(e) = tokio::fs::remove_file(&list_path).await { - // Unexpected: we should have permissions and nothing else should - // be touching these files. We will leave the file behind. Subsequent - // pageservers will try and load it again: hopefully whatever storage - // issue (probably permissions) has been fixed by then. - tracing::error!("Failed to delete {list_path}: {e:#}"); - metrics::DELETION_QUEUE.unexpected_errors.inc(); - break; - } + tokio::fs::remove_file(&list_path) + .await + .fatal_err("remove deletion list"); } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a2e8f30e15..b58b883ab6 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; +use utils::fs_ext; /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally @@ -173,37 +174,78 @@ impl OpenFiles { } } -#[derive(Debug, thiserror::Error)] -pub enum CrashsafeOverwriteError { - #[error("final path has no parent dir")] - FinalPathHasNoParentDir, - #[error("remove tempfile")] - RemovePreviousTempfile(#[source] std::io::Error), - #[error("create tempfile")] - CreateTempfile(#[source] std::io::Error), - #[error("write tempfile")] - WriteContents(#[source] std::io::Error), - #[error("sync tempfile")] - SyncTempfile(#[source] std::io::Error), - #[error("rename tempfile to final path")] - RenameTempfileToFinalPath(#[source] std::io::Error), - #[error("open final path parent dir")] - OpenFinalPathParentDir(#[source] std::io::Error), - #[error("sync final path parent dir")] - SyncFinalPathParentDir(#[source] std::io::Error), +/// Identify error types that should alwways terminate the process. Other +/// error types may be elegible for retry. +pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { + use nix::errno::Errno::*; + match e.raw_os_error().map(nix::errno::from_i32) { + Some(EIO) => { + // Terminate on EIO because we no longer trust the device to store + // data safely, or to uphold persistence guarantees on fsync. + true + } + Some(EROFS) => { + // Terminate on EROFS because a filesystem is usually remounted + // readonly when it has experienced some critical issue, so the same + // logic as EIO applies. + true + } + Some(EACCES) => { + // Terminate on EACCESS because we should always have permissions + // for our own data dir: if we don't, then we can't do our job and + // need administrative intervention to fix permissions. Terminating + // is the best way to make sure we stop cleanly rather than going + // into infinite retry loops, and will make it clear to the outside + // world that we need help. + true + } + _ => { + // Treat all other local file I/O errors are retryable. This includes: + // - ENOSPC: we stay up and wait for eviction to free some space + // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue + // - WriteZero, Interrupted: these are used internally VirtualFile + false + } + } } -impl CrashsafeOverwriteError { - /// Returns true iff the new contents are durably stored. - pub fn are_new_contents_durable(&self) -> bool { + +/// Call this when the local filesystem gives us an error with an external +/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either +/// bad storage or bad configuration, and we can't fix that from inside +/// a running process. +pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! { + tracing::error!("Fatal I/O error: {e}: {context})"); + std::process::abort(); +} + +pub(crate) trait MaybeFatalIo { + fn maybe_fatal_err(self, context: &str) -> std::io::Result; + fn fatal_err(self, context: &str) -> T; +} + +impl MaybeFatalIo for std::io::Result { + /// Terminate the process if the result is an error of a fatal type, else pass it through + /// + /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but + /// not on ENOSPC. + fn maybe_fatal_err(self, context: &str) -> std::io::Result { + if let Err(e) = &self { + if is_fatal_io_error(e) { + on_fatal_io_error(e, context); + } + } + self + } + + /// Terminate the process on any I/O error. + /// + /// This is appropriate for reads on files that we know exist: they should always work. + fn fatal_err(self, context: &str) -> T { match self { - Self::FinalPathHasNoParentDir => false, - Self::RemovePreviousTempfile(_) => false, - Self::CreateTempfile(_) => false, - Self::WriteContents(_) => false, - Self::SyncTempfile(_) => false, - Self::RenameTempfileToFinalPath(_) => false, - Self::OpenFinalPathParentDir(_) => false, - Self::SyncFinalPathParentDir(_) => true, + Ok(v) => v, + Err(e) => { + on_fatal_io_error(&e, context); + } } } } @@ -284,15 +326,13 @@ impl VirtualFile { final_path: &Utf8Path, tmp_path: &Utf8Path, content: &[u8], - ) -> Result<(), CrashsafeOverwriteError> { + ) -> std::io::Result<()> { let Some(final_path_parent) = final_path.parent() else { - return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir); + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); }; - match std::fs::remove_file(tmp_path) { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)), - } + std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; let mut file = Self::open_with_options( tmp_path, OpenOptions::new() @@ -301,31 +341,20 @@ impl VirtualFile { // we bail out instead of causing damage. .create_new(true), ) - .await - .map_err(CrashsafeOverwriteError::CreateTempfile)?; - file.write_all(content) - .await - .map_err(CrashsafeOverwriteError::WriteContents)?; - file.sync_all() - .await - .map_err(CrashsafeOverwriteError::SyncTempfile)?; + .await?; + file.write_all(content).await?; + file.sync_all().await?; drop(file); // before the rename, that's important! // renames are atomic - std::fs::rename(tmp_path, final_path) - .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?; + std::fs::rename(tmp_path, final_path)?; // Only open final path parent dirfd now, so that this operation only // ever holds one VirtualFile fd at a time. That's important because // the current `find_victim_slot` impl might pick the same slot for both // VirtualFile., and it eventually does a blocking write lock instead of // try_lock. let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)) - .await - .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?; - final_parent_dirfd - .sync_all() - .await - .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?; + Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; + final_parent_dirfd.sync_all().await?; Ok(()) }