From 63ca084696f4dd226bfea1abae66dcb3234d1051 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 11 Jul 2025 14:37:55 -0400
Subject: [PATCH] fix(pageserver): downgrade wal apply error during
 gc-compaction (#12518)

## Problem

close LKB-162

close https://github.com/neondatabase/cloud/issues/30665, related to
https://github.com/neondatabase/cloud/issues/29434

We see a lot of errors like:

```
2025-05-22T23:06:14.928959Z ERROR compaction_loop{tenant_id=? shard_id=0304}:run:gc_compact_timeline{timeline_id=?}: error applying 4 WAL records 35/DC0DF0B8..3B/E43188C0 (8119 bytes) to key 000000067F0000400500006027000000B9D0, from base image with LSN 0/0 to reconstruct page image at LSN 61/150B9B20 n_attempts=0: apply_wal_records

Caused by:
    0: read walredo stdout
    1: early eof
```

which is an acceptable form of error and we should downgrade it to
warning.

## Summary of changes

walredo error during gc-compaction is expected when the data below the
gc horizon does not contain a full key history. This is possible in some
rare cases of gc that is only able to remove data in the middle of the
history but not all earlier history when a full keyspace gets deleted.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/walredo.rs | 46 +++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c6d3cafe9a..f053c9ed37 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
     GcCompaction,
 }
 
+impl std::fmt::Display for RedoAttemptType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RedoAttemptType::ReadPage => write!(f, "read page"),
+            RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
+            RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
+        }
+    }
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
                         self.conf.wal_redo_timeout,
                         pg_version,
                         max_retry_attempts,
+                        redo_attempt_type,
                     )
                     .await
                 };
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
                 max_retry_attempts,
+                redo_attempt_type,
             )
             .await
         }
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
         wal_redo_timeout: Duration,
         pg_version: PgMajorVersion,
         max_retry_attempts: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
@@ -485,17 +498,28 @@ impl PostgresRedoManager {
                 );
 
                 if let Err(e) = result.as_ref() {
-                    error!(
-                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                        records.len(),
-                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                        nbytes,
-                        base_img_lsn,
-                        lsn,
-                        n_attempts,
-                        e,
-                    );
+                    macro_rules! message {
+                        ($level:tt) => {
+                            $level!(
+                                "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                                records.len(),
+                                records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                                records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                                nbytes,
+                                key,
+                                redo_attempt_type,
+                                base_img_lsn,
+                                lsn,
+                                n_attempts,
+                                e,
+                            )
+                        }
+                    }
+                    match redo_attempt_type {
+                        RedoAttemptType::ReadPage => message!(error),
+                        RedoAttemptType::LegacyCompaction => message!(error),
+                        RedoAttemptType::GcCompaction => message!(warn),
+                    }
                 }
 
                 result.map_err(Error::Other)