mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-03 19:42:55 +00:00
fix(pageserver): abort process if fsync fails (#9108)
close https://github.com/neondatabase/neon/issues/8140 The original issue is rather vague on what we should do. After discussion w/ @problame we decided to narrow down the problems we want to solve in that issue. * read path -- do not panic for now. * write path -- panic only on write errors (i.e., device error, fsync error), but not on no-space for now. The guideline is that if the pageserver behavior could lead to violation of persistent constraints (i.e., return an operation as successful but not actually persisting things), we should panic. Fsync is the place where both of us agree that we should panic, because if fsync fails, the kernel will mark dirty pages as clean, and the next fsync will not necessarily return false. This would make the storage client assume the operation is successful. ## Summary of changes Make fsync panic on fatal errors. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
@@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
+ TEMP_FILE_SUFFIX;
|
||||
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
|
||||
fs::rename(path.as_ref(), &tmp_path).await?;
|
||||
fs::File::open(parent).await?.sync_all().await?;
|
||||
fs::File::open(parent)
|
||||
.await?
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("safe_rename_tenant_dir")?;
|
||||
Ok(tmp_path)
|
||||
}
|
||||
|
||||
|
||||
@@ -178,6 +178,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -185,6 +186,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -232,6 +234,7 @@ async fn download_object<'a>(
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("download_object sync_all")
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ use crate::tenant::vectored_blob_io::{
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
@@ -589,7 +589,9 @@ impl DeltaLayerWriterInner {
|
||||
);
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("delta_layer sync_all")?;
|
||||
|
||||
trace!("created delta layer {}", self.path);
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ use crate::tenant::vectored_blob_io::{
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
@@ -889,7 +889,9 @@ impl ImageLayerWriterInner {
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
|
||||
// fsync the file
|
||||
file.sync_all().await?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.maybe_fatal_err("image_layer sync_all")?;
|
||||
|
||||
trace!("created image layer {}", self.path);
|
||||
|
||||
|
||||
@@ -466,6 +466,7 @@ impl VirtualFile {
|
||||
&[]
|
||||
};
|
||||
utils::crashsafe::overwrite(&final_path, &tmp_path, content)
|
||||
.maybe_fatal_err("crashsafe_overwrite")
|
||||
})
|
||||
.await
|
||||
.expect("blocking task is never aborted")
|
||||
@@ -475,7 +476,7 @@ impl VirtualFile {
|
||||
pub async fn sync_all(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
|
||||
res
|
||||
res.maybe_fatal_err("sync_all")
|
||||
})
|
||||
}
|
||||
|
||||
@@ -483,7 +484,7 @@ impl VirtualFile {
|
||||
pub async fn sync_data(&self) -> Result<(), Error> {
|
||||
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
|
||||
let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
|
||||
res
|
||||
res.maybe_fatal_err("sync_data")
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user