From 05381a48f05873f2bc64d116720c51b579f97a58 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Nov 2024 18:57:31 +0100 Subject: [PATCH] utils: remove unnecessary fsync in `durable_rename()` (#9686) ## Problem WAL segment fsyncs significantly affect WAL ingestion throughput. `durable_rename()` is used when initializing every 16 MB segment, and issues 3 fsyncs of which 1 was unnecessary. ## Summary of changes Remove an fsync in `durable_rename` which is unnecessary with Linux and ext4 (which we currently use). This improves WAL ingestion throughput by up to 23% with large appends on my MacBook. --- libs/utils/src/crashsafe.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index b97c6c7a45..5241ab183c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -123,15 +123,27 @@ pub async fn fsync_async_opt( Ok(()) } -/// Like postgres' durable_rename, renames file issuing fsyncs do make it -/// durable. After return, file and rename are guaranteed to be persisted. +/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After +/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the +/// same file system. /// -/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make -/// contents durable; 2) its directory entry to make rename durable 3) again to -/// already renamed file, which is not required by standards but postgres does -/// it, let's stick to that. Postgres additionally fsyncs newpath *before* -/// rename if it exists to ensure that at least one of the files survives, but -/// current callers don't need that. +/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to +/// make the rename durable. This sequence ensures the target file will never be incomplete. +/// +/// Postgres also: +/// +/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing +/// file survives a crash. Current callers don't need this as it should already be fsynced if +/// durability is needed. +/// +/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g. +/// NFS), but not on Linux with most common file systems like ext4 (which we currently use). +/// +/// An audit of 8 other databases found that none fsynced the file after a rename: +/// +/// +/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs: +/// /// /// virtual_file.rs has similar code, but it doesn't use vfs. /// @@ -149,9 +161,6 @@ pub async fn durable_rename( // Time to do the real deal. tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; - // Postgres'ish fsync of renamed file. - fsync_async_opt(new_path.as_ref(), do_fsync).await?; - // Now fsync the parent let parent = match new_path.as_ref().parent() { Some(p) => p,