Merge remote-tracking branch 'origin/main' into communicator-rewrite

This commit is contained in:
Heikki Linnakangas
2025-07-30 11:19:42 +03:00
22 changed files with 219 additions and 6 deletions

View File

@@ -397,6 +397,11 @@ pub struct Timeline {
/// If true, the last compaction failed.
compaction_failed: AtomicBool,
/// Begin Hadron: If true, the pageserver has likely detected data corruption in the timeline.
/// We need to feed this information back to the Safekeeper and postgres for them to take the
/// appropriate action.
corruption_detected: AtomicBool,
/// Notifies the tenant compaction loop that there is pending L0 compaction work.
l0_compaction_trigger: Arc<Notify>,
@@ -3310,6 +3315,7 @@ impl Timeline {
compaction_lock: tokio::sync::Mutex::default(),
compaction_failed: AtomicBool::default(),
corruption_detected: AtomicBool::default(),
l0_compaction_trigger: resources.l0_compaction_trigger,
gc_lock: tokio::sync::Mutex::default(),
@@ -6004,6 +6010,17 @@ impl Timeline {
)))
});
// Begin Hadron
//
fail_point!("create-image-layer-fail-simulated-corruption", |_| {
self.corruption_detected
.store(true, std::sync::atomic::Ordering::Relaxed);
Err(CreateImageLayersError::Other(anyhow::anyhow!(
"failpoint create-image-layer-fail-simulated-corruption"
)))
});
// End Hadron
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.gate
@@ -7149,6 +7166,7 @@ impl Timeline {
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
Some(&self.corruption_detected),
"walredo failure during page reconstruction: {err:?}"
);
}

View File

@@ -1397,6 +1397,7 @@ impl Timeline {
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
Some(&self.corruption_detected),
"missing key during compaction: {err:?}"
);
}
@@ -1441,6 +1442,7 @@ impl Timeline {
critical_timeline!(
self.tenant_shard_id,
self.timeline_id,
Some(&self.corruption_detected),
"could not compact, repartitioning keyspace failed: {e:?}"
);
}

View File

@@ -365,6 +365,7 @@ pub(super) async fn handle_walreceiver_connection(
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
Some(&timeline.corruption_detected),
"{msg}"
);
return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -382,6 +383,7 @@ pub(super) async fn handle_walreceiver_connection(
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
Some(&timeline.corruption_detected),
"{msg}"
);
return Err(WalReceiverError::Other(anyhow!(msg)));
@@ -455,6 +457,7 @@ pub(super) async fn handle_walreceiver_connection(
critical_timeline!(
timeline.tenant_shard_id,
timeline.timeline_id,
Some(&timeline.corruption_detected),
"{err:?}"
);
}
@@ -586,6 +589,9 @@ pub(super) async fn handle_walreceiver_connection(
remote_consistent_lsn,
replytime: ts,
shard_number: timeline.tenant_shard_id.shard_number.0 as u32,
corruption_detected: timeline
.corruption_detected
.load(std::sync::atomic::Ordering::Relaxed),
};
debug!("neon_status_update {status_update:?}");

View File

@@ -23,6 +23,7 @@
use std::backtrace::Backtrace;
use std::collections::HashMap;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant, SystemTime};
@@ -422,6 +423,8 @@ impl WalIngest {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
None::<&AtomicBool>,
"clear_vm_bits for unknown VM relation {vm_rel}"
);
return Ok(());
@@ -431,6 +434,8 @@ impl WalIngest {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
None::<&AtomicBool>,
"new_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
);
new_vm_blk = None;
@@ -441,6 +446,8 @@ impl WalIngest {
critical_timeline!(
modification.tline.tenant_shard_id,
modification.tline.timeline_id,
// Hadron: No need to raise the corruption flag here; the caller of `ingest_record()` will do it.
None::<&AtomicBool>,
"old_vm_blk {blknum} not in {vm_rel} of size {vm_size}"
);
old_vm_blk = None;