From 6b19867410a92084a448d5058ca4329eafb01be8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Nov 2024 16:17:03 +0100 Subject: [PATCH] safekeeper: don't flush control file on WAL ingest path (#9698) ## Problem The control file is flushed on the WAL ingest path when the commit LSN advances by one segment, to bound the amount of recovery work in case of a crash. This involves 3 additional fsyncs, which can have a significant impact on WAL ingest throughput. This is to some extent mitigated by `AppendResponse` not being emitted on segment bound flushes, since this will prevent commit LSN advancement, which will be addressed separately. ## Summary of changes Don't flush the control file on the WAL ingest path at all. Instead, leave that responsibility to the timeline manager, but ask it to flush eagerly if the control file lags the in-memory commit LSN by more than one segment. This should not cause more than `REFRESH_INTERVAL` (300 ms) additional latency before flushing the control file, which is negligible. --- libs/utils/src/lsn.rs | 5 +++++ safekeeper/src/safekeeper.rs | 12 ++---------- safekeeper/src/timeline_manager.rs | 7 ++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 524f3604a1..f188165600 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -138,6 +138,11 @@ impl Lsn { self.0.checked_sub(other).map(Lsn) } + /// Subtract a number, saturating at numeric bounds instead of overflowing. + pub fn saturating_sub>(self, other: T) -> Lsn { + Lsn(self.0.saturating_sub(other.into())) + } + /// Subtract a number, returning the difference as i128 to avoid overflow. pub fn widening_sub>(self, other: T) -> i128 { let other: u64 = other.into(); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index cf41d7a0ab..f4983d44d0 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -979,7 +979,8 @@ where self.wal_store.flush_wal().await?; } - // Update commit_lsn. + // Update commit_lsn. It will be flushed to the control file regularly by the timeline + // manager, off of the WAL ingest hot path. if msg.h.commit_lsn != Lsn(0) { self.update_commit_lsn(msg.h.commit_lsn).await?; } @@ -992,15 +993,6 @@ where self.state.inmem.peer_horizon_lsn = max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn); - // Update truncate and commit LSN in control file. - // To avoid negative impact on performance of extra fsync, do it only - // when commit_lsn delta exceeds WAL segment size. - if self.state.commit_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.commit_lsn - { - self.state.flush().await?; - } - trace!( "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 79200fff8d..e9fed21bf5 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -515,7 +515,12 @@ impl Manager { return; } - if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval { + if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval + // If the control file's commit_lsn lags more than one segment behind the current + // commit_lsn, flush immediately to limit recovery time in case of a crash. We don't do + // this on the WAL ingest hot path since it incurs fsync latency. + || state.commit_lsn.saturating_sub(state.cfile_commit_lsn).0 >= self.wal_seg_size as u64 + { let mut write_guard = self.tli.write_shared_state().await; // it should be done in the background because it blocks manager task, but flush() should // be fast enough not to be a problem now