From 63ca084696f4dd226bfea1abae66dcb3234d1051 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 11 Jul 2025 14:37:55 -0400 Subject: [PATCH] fix(pageserver): downgrade wal apply error during gc-compaction (#12518) ## Problem close LKB-162 close https://github.com/neondatabase/cloud/issues/30665, related to https://github.com/neondatabase/cloud/issues/29434 We see a lot of errors like: ``` 2025-05-22T23:06:14.928959Z ERROR compaction_loop{tenant_id=? shard_id=0304}:run:gc_compact_timeline{timeline_id=?}: error applying 4 WAL records 35/DC0DF0B8..3B/E43188C0 (8119 bytes) to key 000000067F0000400500006027000000B9D0, from base image with LSN 0/0 to reconstruct page image at LSN 61/150B9B20 n_attempts=0: apply_wal_records Caused by: 0: read walredo stdout 1: early eof ``` which is an acceptable form of error and we should downgrade it to warning. ## Summary of changes walredo error during gc-compaction is expected when the data below the gc horizon does not contain a full key history. This is possible in some rare cases of gc that is only able to remove data in the middle of the history but not all earlier history when a full keyspace gets deleted. Signed-off-by: Alex Chi Z --- pageserver/src/walredo.rs | 46 +++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c6d3cafe9a..f053c9ed37 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -147,6 +147,16 @@ pub enum RedoAttemptType { GcCompaction, } +impl std::fmt::Display for RedoAttemptType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RedoAttemptType::ReadPage => write!(f, "read page"), + RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"), + RedoAttemptType::GcCompaction => write!(f, "gc compaction"), + } + } +} + /// /// Public interface of WAL redo manager /// @@ -199,6 +209,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, max_retry_attempts, + redo_attempt_type, ) .await }; @@ -221,6 +232,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, max_retry_attempts, + redo_attempt_type, ) .await } @@ -445,6 +457,7 @@ impl PostgresRedoManager { wal_redo_timeout: Duration, pg_version: PgMajorVersion, max_retry_attempts: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); @@ -485,17 +498,28 @@ impl PostgresRedoManager { ); if let Err(e) = result.as_ref() { - error!( - "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", - records.len(), - records.first().map(|p| p.0).unwrap_or(Lsn(0)), - records.last().map(|p| p.0).unwrap_or(Lsn(0)), - nbytes, - base_img_lsn, - lsn, - n_attempts, - e, - ); + macro_rules! message { + ($level:tt) => { + $level!( + "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}", + records.len(), + records.first().map(|p| p.0).unwrap_or(Lsn(0)), + records.last().map(|p| p.0).unwrap_or(Lsn(0)), + nbytes, + key, + redo_attempt_type, + base_img_lsn, + lsn, + n_attempts, + e, + ) + } + } + match redo_attempt_type { + RedoAttemptType::ReadPage => message!(error), + RedoAttemptType::LegacyCompaction => message!(error), + RedoAttemptType::GcCompaction => message!(warn), + } } result.map_err(Error::Other)