Compare commits

...

3 Commits

Author SHA1 Message Date
John Spray
8df507ccea pageserver: make I/O errors in deletion queue fatal 2023-10-02 14:28:04 +01:00
John Spray
cc2c1a8bf4 pageserver: add hook for terminating on I/O errors 2023-10-02 14:28:04 +01:00
John Spray
218b514498 pageserver: deletion queue nits 2023-10-02 11:41:10 +01:00
3 changed files with 47 additions and 12 deletions

View File

@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file;
use crate::virtual_file::on_fatal_io_error;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
@@ -199,7 +201,8 @@ impl ListWriter {
);
Ok(None)
} else {
Err(anyhow::anyhow!(e))
on_fatal_io_error(&e);
unreachable!();
}
}
}
@@ -228,8 +231,10 @@ impl ListWriter {
deletion_directory.display(),
);
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
// This is fatal: any failure to read this local directory indicates a
// storage problem or configuration problem of the node.
virtual_file::on_fatal_io_error(&e);
return Err(e.into());
}
};
@@ -243,26 +248,27 @@ impl ListWriter {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
if Some(file_name.as_os_str()) == header_path.file_name() {
// Don't try and parse the header's name like a list
continue;
}
// Temporary files might be left behind from `crashsafe_overwrite`
if dentry_str.ends_with(TEMP_SUFFIX) {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path = deletion_directory.join(dentry.file_name());
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Fatal error: a failure to remove a file from this local directory indicates
// a storage or configuration problem, so we log it and then terminate.
warn!(
"Failed to clean up temporary file {}: {e:#}",
absolute_path.display()
);
virtual_file::on_fatal_io_error(&e);
}
continue;
}
if Some(file_name.as_os_str()) == header_path.file_name() {
// Don't try and parse the header's name like a list
continue;
}
let file_name = dentry.file_name().to_owned();
let basename = file_name.to_string_lossy();
let seq_part = if let Some(m) = list_name_pattern.captures(&basename) {
@@ -270,7 +276,7 @@ impl ListWriter {
.expect("Non optional group should be present")
.as_str()
} else {
warn!("Unexpected key in deletion queue: {basename}");
warn!("Unexpected filename in deletion queue: {basename}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
continue;
};
@@ -298,7 +304,13 @@ impl ListWriter {
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let list_bytes = match tokio::fs::read(&list_path).await {
Ok(b) => b,
Err(e) => {
virtual_file::on_fatal_io_error(&e);
unreachable!();
}
};
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,

View File

@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::on_fatal_io_error;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
@@ -116,6 +117,11 @@ where
/// Valid LSN updates propagate back to Timelines immediately, valid DeletionLists
/// go into the queue of ready-to-execute lists.
async fn validate(&mut self) -> Result<(), DeletionQueueError> {
// Figure out for each tenant which generation number to validate.
//
// It is sufficient to validate the max generation number of each tenant because only the
// highest generation number can possibly be valid. Hence this map will collect the
// highest generation pending validation for each tenant.
let mut tenant_generations = HashMap::new();
for list in &self.pending_lists {
for (tenant_id, tenant_list) in &list.tenants {
@@ -246,6 +252,11 @@ where
}
}
// Assert monotonicity of the list sequence numbers we are processing
if let Some(validated) = validated_sequence {
assert!(list.sequence >= validated)
}
validated_sequence = Some(list.sequence);
}
@@ -293,6 +304,9 @@ where
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {}: {e:#}", list_path.display());
metrics::DELETION_QUEUE.unexpected_errors.inc();
on_fatal_io_error(&e);
break;
}
}

View File

@@ -208,6 +208,15 @@ impl CrashsafeOverwriteError {
}
}
/// Call this when the local filesystem gives us an error with an external
/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
/// bad storage or bad configuration, and we can't fix that from inside
/// a running process.
pub(crate) fn on_fatal_io_error(e: &std::io::Error) {
tracing::error!("Fatal I/O error: {}", &e);
std::process::abort();
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {