From c861d71eeb6d3acfc4c99ced41dd0df778cda802 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Mar 2024 13:18:22 +0100 Subject: [PATCH] layer file creation: fatal_err on timeline dir fsync (#6985) As pointed out in the comments added in this PR: the in-memory state of the filesystem already has the layer file in its final place. If the fsync fails, but pageserver continues to execute, it's quite easy for subsequent pageserver code to observe the file being there and assume it's durable, when it really isn't. It can happen that we get ENOSPC during the fsync. However, 1. the timeline dir is small (remember, the big layer _file_ has already been synced). Small data means ENOSPC due to delayed allocation races etc are less likely. 2. what else are we going to do in that case? If we decide to bubble up the error, the file remains on disk. We could try to unlink it and fsync after the unlink. If that fails, we would _definitely_ need to error out. Is it worth the trouble though? Side note: all this logic about not carrying on after fsync failure implies that we `sync` the filesystem successfully before we restart the pageserver. We don't do that right now, but should (=> https://github.com/neondatabase/neon/issues/6989) part of https://github.com/neondatabase/neon/issues/6663 --- pageserver/src/tenant/timeline.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0c03ef33c3..0a2ae5d8bd 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -50,7 +50,6 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::sync::gate::{Gate, GateGuard}; -use crate::pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -75,6 +74,10 @@ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{ + pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + virtual_file::MaybeFatalIo, +}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -3426,10 +3429,14 @@ impl Timeline { // The write_to_disk() above calls writer.finish() which already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because the after write_to_disk returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync(&[self_clone .conf .timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)]) - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); anyhow::Ok(new_delta) } @@ -3662,11 +3669,14 @@ impl Timeline { // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. if !image_layers.is_empty() { + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. par_fsync::par_fsync_async(&[self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id)]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } let mut guard = self.layers.write().await; @@ -4251,12 +4261,16 @@ impl Timeline { // The writer.finish() above already did the fsync of the inodes. // We just need to fsync the directory in which these inodes are linked, // which we know to be the timeline directory. + // + // We use fatal_err() below because the after writer.finish() returns with success, + // the in-memory state of the filesystem already has the layer file in its final place, + // and subsequent pageserver code could think it's durable while it really isn't. let timeline_dir = self .conf .timeline_path(&self.tenant_shard_id, &self.timeline_id); par_fsync::par_fsync_async(&[timeline_dir]) .await - .context("fsync of timeline dir")?; + .fatal_err("fsync of timeline dir"); } stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();