[BRC-2905] Feed back PS-detected data corruption signals to SK and PG… (#12748)

… walproposer (#895) Data corruption is typically detected on the pageserver side when it replays WAL records. However, since PS doesn't synchronously replay WAL records as they are being ingested through safekeepers, we need some extra plumbing to feed information about pageserver-detected corruption during compaction (and/or WAL redo in general) back to SK and PG for proper action. We don't yet know what actions PG/SK should take upon receiving the signal, but we should have the detection and feedback in place. Add an extra `corruption_detected` field to the `PageserverFeedback` message that is sent from PS -> SK -> PG. It's a boolean value that is set to true when PS detects a "critical error" that signals data corruption, and it's sent in all `PageserverFeedback` messages. Upon receiving this signal, the safekeeper raises a `safekeeper_ps_corruption_detected` gauge metric (value set to 1). The safekeeper then forwards this signal to PG, where a `ps_corruption_detected` gauge metric (value also set to 1) is raised in the `neon_perf_counters` view. Add an integration test, `test_compaction.py::test_ps_corruption_detection_feedback`, that confirms that the safekeeper and PG receive the data corruption signal in the `PageserverFeedback` message under simulated data corruption. ## Problem ## Summary of changes --------- Co-authored-by: William Huang <william.huang@databricks.com>
2025-12-27 08:09:58 +00:00 · 2025-07-29 16:40:07 -04:00
parent 7cd0066212
commit 07c3cfd2a0
18 changed files with 208 additions and 1 deletions
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -397,6 +397,11 @@ pub struct Timeline {
    /// If true, the last compaction failed.
    compaction_failed: AtomicBool,

+    /// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline.
+    /// We need to feed this information back to the Safekeeper and postgres for them to take the
+    /// appropriate action.
+    corruption_detected: AtomicBool,
+
    /// Notifies the tenant compaction loop that there is pending L0 compaction work.
    l0_compaction_trigger: Arc<Notify>,

@@ -3310,6 +3315,7 @@ impl Timeline {

                compaction_lock: tokio::sync::Mutex::default(),
                compaction_failed: AtomicBool::default(),
+                corruption_detected: AtomicBool::default(),
                l0_compaction_trigger: resources.l0_compaction_trigger,
                gc_lock: tokio::sync::Mutex::default(),

@@ -6004,6 +6010,17 @@ impl Timeline {
                )))
            });

+            // Begin Hadron
+            // Failpoint for tests: raises the corruption flag, then returns an error.
+            fail_point!("create-image-layer-fail-simulated-corruption", |_| {
+                self.corruption_detected
+                    .store(true, std::sync::atomic::Ordering::Relaxed);
+                Err(CreateImageLayersError::Other(anyhow::anyhow!(
+                    "failpoint create-image-layer-fail-simulated-corruption"
+                )))
+            });
+            // End Hadron
+
            let io_concurrency = IoConcurrency::spawn_from_conf(
                self.conf.get_vectored_concurrent_io,
                self.gate
@@ -7149,6 +7166,7 @@ impl Timeline {
                            critical_timeline!(
                                self.tenant_shard_id,
                                self.timeline_id,
+                                Some(&self.corruption_detected),
                                "walredo failure during page reconstruction: {err:?}"
                            );
                        }
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1397,6 +1397,7 @@ impl Timeline {
                            critical_timeline!(
                                self.tenant_shard_id,
                                self.timeline_id,
+                                Some(&self.corruption_detected),
                                "missing key during compaction: {err:?}"
                            );
                        }
@@ -1441,6 +1442,7 @@ impl Timeline {
                critical_timeline!(
                    self.tenant_shard_id,
                    self.timeline_id,
+                    Some(&self.corruption_detected),
                    "could not compact, repartitioning keyspace failed: {e:?}"
                );
            }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -365,6 +365,7 @@ pub(super) async fn handle_walreceiver_connection(
                                critical_timeline!(
                                    timeline.tenant_shard_id,
                                    timeline.timeline_id,
+                                    Some(&timeline.corruption_detected),
                                    "{msg}"
                                );
                                return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -382,6 +383,7 @@ pub(super) async fn handle_walreceiver_connection(
                                        critical_timeline!(
                                            timeline.tenant_shard_id,
                                            timeline.timeline_id,
+                                            Some(&timeline.corruption_detected),
                                            "{msg}"
                                        );
                                        return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -455,6 +457,7 @@ pub(super) async fn handle_walreceiver_connection(
                                critical_timeline!(
                                    timeline.tenant_shard_id,
                                    timeline.timeline_id,
+                                    Some(&timeline.corruption_detected),
                                    "{err:?}"
                                );
                            }
@@ -586,6 +589,9 @@ pub(super) async fn handle_walreceiver_connection(
                remote_consistent_lsn,
                replytime: ts,
                shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
+                corruption_detected: timeline
+                    .corruption_detected
+                    .load(std::sync::atomic::Ordering::Relaxed),
            };

            debug!("neon_status_update {status_update:?}");
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -23,6 +23,7 @@

 use std::backtrace::Backtrace;
 use std::collections::HashMap;
+use std::sync::atomic::AtomicBool;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};

@@ -422,6 +423,8 @@ impl WalIngest {
            critical_timeline!(
                modification.tline.tenant_shard_id,
                modification.tline.timeline_id,
+                // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
+                None::<&AtomicBool>,
                "clear_vm_bits for unknown VM relation {vm_rel}"
            );
            return Ok(());
@@ -431,6 +434,8 @@ impl WalIngest {
                critical_timeline!(
                    modification.tline.tenant_shard_id,
                    modification.tline.timeline_id,
+                    // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
+                    None::<&AtomicBool>,
                    "new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
                );
                new_vm_blk = None;
@@ -441,6 +446,8 @@ impl WalIngest {
                critical_timeline!(
                    modification.tline.tenant_shard_id,
                    modification.tline.timeline_id,
+                    // Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
+                    None::<&AtomicBool>,
                    "old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
                );
                old_vm_blk = None;