diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 22efa23f10..7b2db929fa 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::remote_timeline_client::remote_timeline_path; +use crate::virtual_file::MaybeFatalIo; use crate::virtual_file::VirtualFile; use anyhow::Context; use camino::Utf8PathBuf; @@ -271,7 +272,9 @@ impl DeletionHeader { let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX); VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes) .await - .map_err(Into::into) + .maybe_fatal_err("save deletion header")?; + + Ok(()) } } @@ -360,6 +363,7 @@ impl DeletionList { let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list"); VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes) .await + .maybe_fatal_err("save deletion list") .map_err(Into::into) } } diff --git a/pageserver/src/deletion_queue/list_writer.rs b/pageserver/src/deletion_queue/list_writer.rs index 6727957b2a..28daae2da5 100644 --- a/pageserver/src/deletion_queue/list_writer.rs +++ b/pageserver/src/deletion_queue/list_writer.rs @@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX; use crate::metrics; use crate::tenant::remote_timeline_client::remote_layer_path; use crate::tenant::storage_layer::LayerFileName; +use crate::virtual_file::on_fatal_io_error; +use crate::virtual_file::MaybeFatalIo; // The number of keys in a DeletionList before we will proactively persist it // (without reaching a flush deadline). This aims to deliver objects of the order @@ -195,7 +197,7 @@ impl ListWriter { debug!("Deletion header {header_path} not found, first start?"); Ok(None) } else { - Err(anyhow::anyhow!(e)) + on_fatal_io_error(&e, "reading deletion header"); } } } @@ -216,16 +218,9 @@ impl ListWriter { self.pending.sequence = validated_sequence + 1; let deletion_directory = self.conf.deletion_prefix(); - let mut dir = match tokio::fs::read_dir(&deletion_directory).await { - Ok(d) => d, - Err(e) => { - warn!("Failed to open deletion list directory {deletion_directory}: {e:#}"); - - // Give up: if we can't read the deletion list directory, we probably can't - // write lists into it later, so the queue won't work. - return Err(e.into()); - } - }; + let mut dir = tokio::fs::read_dir(&deletion_directory) + .await + .fatal_err("read deletion directory"); let list_name_pattern = Regex::new("(?[a-zA-Z0-9]{16})-(?[a-zA-Z0-9]{2}).list").unwrap(); @@ -233,7 +228,7 @@ impl ListWriter { let temp_extension = format!(".{TEMP_SUFFIX}"); let header_path = self.conf.deletion_header_path(); let mut seqs: Vec = Vec::new(); - while let Some(dentry) = dir.next_entry().await? { + while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") { let file_name = dentry.file_name(); let dentry_str = file_name.to_string_lossy(); @@ -246,11 +241,9 @@ impl ListWriter { info!("Cleaning up temporary file {dentry_str}"); let absolute_path = deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path")); - if let Err(e) = tokio::fs::remove_file(&absolute_path).await { - // Non-fatal error: we will just leave the file behind but not - // try and load it. - warn!("Failed to clean up temporary file {absolute_path}: {e:#}"); - } + tokio::fs::remove_file(&absolute_path) + .await + .fatal_err("delete temp file"); continue; } @@ -290,7 +283,9 @@ impl ListWriter { for s in seqs { let list_path = self.conf.deletion_list_path(s); - let list_bytes = tokio::fs::read(&list_path).await?; + let list_bytes = tokio::fs::read(&list_path) + .await + .fatal_err("read deletion list"); let mut deletion_list = match serde_json::from_slice::(&list_bytes) { Ok(l) => l, diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index a2cbfb9dc7..72bdbdefd6 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -28,6 +28,7 @@ use crate::config::PageServerConf; use crate::control_plane_client::ControlPlaneGenerationsApi; use crate::control_plane_client::RetryForeverError; use crate::metrics; +use crate::virtual_file::MaybeFatalIo; use super::deleter::DeleterMessage; use super::DeletionHeader; @@ -287,16 +288,9 @@ where async fn cleanup_lists(&mut self, list_paths: Vec) { for list_path in list_paths { debug!("Removing deletion list {list_path}"); - - if let Err(e) = tokio::fs::remove_file(&list_path).await { - // Unexpected: we should have permissions and nothing else should - // be touching these files. We will leave the file behind. Subsequent - // pageservers will try and load it again: hopefully whatever storage - // issue (probably permissions) has been fixed by then. - tracing::error!("Failed to delete {list_path}: {e:#}"); - metrics::DELETION_QUEUE.unexpected_errors.inc(); - break; - } + tokio::fs::remove_file(&list_path) + .await + .fatal_err("remove deletion list"); } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index a2e8f30e15..b58b883ab6 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom}; use std::os::unix::fs::FileExt; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::{RwLock, RwLockWriteGuard}; +use utils::fs_ext; /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally @@ -173,37 +174,78 @@ impl OpenFiles { } } -#[derive(Debug, thiserror::Error)] -pub enum CrashsafeOverwriteError { - #[error("final path has no parent dir")] - FinalPathHasNoParentDir, - #[error("remove tempfile")] - RemovePreviousTempfile(#[source] std::io::Error), - #[error("create tempfile")] - CreateTempfile(#[source] std::io::Error), - #[error("write tempfile")] - WriteContents(#[source] std::io::Error), - #[error("sync tempfile")] - SyncTempfile(#[source] std::io::Error), - #[error("rename tempfile to final path")] - RenameTempfileToFinalPath(#[source] std::io::Error), - #[error("open final path parent dir")] - OpenFinalPathParentDir(#[source] std::io::Error), - #[error("sync final path parent dir")] - SyncFinalPathParentDir(#[source] std::io::Error), +/// Identify error types that should alwways terminate the process. Other +/// error types may be elegible for retry. +pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { + use nix::errno::Errno::*; + match e.raw_os_error().map(nix::errno::from_i32) { + Some(EIO) => { + // Terminate on EIO because we no longer trust the device to store + // data safely, or to uphold persistence guarantees on fsync. + true + } + Some(EROFS) => { + // Terminate on EROFS because a filesystem is usually remounted + // readonly when it has experienced some critical issue, so the same + // logic as EIO applies. + true + } + Some(EACCES) => { + // Terminate on EACCESS because we should always have permissions + // for our own data dir: if we don't, then we can't do our job and + // need administrative intervention to fix permissions. Terminating + // is the best way to make sure we stop cleanly rather than going + // into infinite retry loops, and will make it clear to the outside + // world that we need help. + true + } + _ => { + // Treat all other local file I/O errors are retryable. This includes: + // - ENOSPC: we stay up and wait for eviction to free some space + // - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue + // - WriteZero, Interrupted: these are used internally VirtualFile + false + } + } } -impl CrashsafeOverwriteError { - /// Returns true iff the new contents are durably stored. - pub fn are_new_contents_durable(&self) -> bool { + +/// Call this when the local filesystem gives us an error with an external +/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either +/// bad storage or bad configuration, and we can't fix that from inside +/// a running process. +pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! { + tracing::error!("Fatal I/O error: {e}: {context})"); + std::process::abort(); +} + +pub(crate) trait MaybeFatalIo { + fn maybe_fatal_err(self, context: &str) -> std::io::Result; + fn fatal_err(self, context: &str) -> T; +} + +impl MaybeFatalIo for std::io::Result { + /// Terminate the process if the result is an error of a fatal type, else pass it through + /// + /// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but + /// not on ENOSPC. + fn maybe_fatal_err(self, context: &str) -> std::io::Result { + if let Err(e) = &self { + if is_fatal_io_error(e) { + on_fatal_io_error(e, context); + } + } + self + } + + /// Terminate the process on any I/O error. + /// + /// This is appropriate for reads on files that we know exist: they should always work. + fn fatal_err(self, context: &str) -> T { match self { - Self::FinalPathHasNoParentDir => false, - Self::RemovePreviousTempfile(_) => false, - Self::CreateTempfile(_) => false, - Self::WriteContents(_) => false, - Self::SyncTempfile(_) => false, - Self::RenameTempfileToFinalPath(_) => false, - Self::OpenFinalPathParentDir(_) => false, - Self::SyncFinalPathParentDir(_) => true, + Ok(v) => v, + Err(e) => { + on_fatal_io_error(&e, context); + } } } } @@ -284,15 +326,13 @@ impl VirtualFile { final_path: &Utf8Path, tmp_path: &Utf8Path, content: &[u8], - ) -> Result<(), CrashsafeOverwriteError> { + ) -> std::io::Result<()> { let Some(final_path_parent) = final_path.parent() else { - return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir); + return Err(std::io::Error::from_raw_os_error( + nix::errno::Errno::EINVAL as i32, + )); }; - match std::fs::remove_file(tmp_path) { - Ok(()) => {} - Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} - Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)), - } + std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?; let mut file = Self::open_with_options( tmp_path, OpenOptions::new() @@ -301,31 +341,20 @@ impl VirtualFile { // we bail out instead of causing damage. .create_new(true), ) - .await - .map_err(CrashsafeOverwriteError::CreateTempfile)?; - file.write_all(content) - .await - .map_err(CrashsafeOverwriteError::WriteContents)?; - file.sync_all() - .await - .map_err(CrashsafeOverwriteError::SyncTempfile)?; + .await?; + file.write_all(content).await?; + file.sync_all().await?; drop(file); // before the rename, that's important! // renames are atomic - std::fs::rename(tmp_path, final_path) - .map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?; + std::fs::rename(tmp_path, final_path)?; // Only open final path parent dirfd now, so that this operation only // ever holds one VirtualFile fd at a time. That's important because // the current `find_victim_slot` impl might pick the same slot for both // VirtualFile., and it eventually does a blocking write lock instead of // try_lock. let final_parent_dirfd = - Self::open_with_options(final_path_parent, OpenOptions::new().read(true)) - .await - .map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?; - final_parent_dirfd - .sync_all() - .await - .map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?; + Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?; + final_parent_dirfd.sync_all().await?; Ok(()) }