feat(pageserver): gc-compaction result verification (#11515)

## Problem

Part of #9114 

There was a debug-mode verification mode that verifies at every
retain_lsn. However, the code was tangled within the actual history
generation itself and it's hard to reason about correctness. This patch
adds a separate post-verification of the gc-compaction result that redos
logs at every retain_lsn and every record above the GC horizon. This
ensures that all key history we produce with gc-compaction is readable,
and if there're read errors after gc-compaction, it can only be
read-path errors instead of gc-compaction bugs.

## Summary of changes

* Add gc_compaction_verification flag, default to true.
* Implement a post-verification process.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
This commit is contained in:
Alex Chi Z.
2025-04-11 11:50:29 -04:00
committed by GitHub
parent 66f56ddaec
commit 4f7b2cdd4f
7 changed files with 170 additions and 14 deletions

View File

@@ -535,6 +535,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_enabled' as bool")?,
gc_compaction_verification: settings
.remove("gc_compaction_verification")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_compaction_verification' as bool")?,
gc_compaction_initial_threshold_kb: settings
.remove("gc_compaction_initial_threshold_kb")
.map(|x| x.parse::<u64>())

View File

@@ -452,6 +452,8 @@ pub struct TenantConfigToml {
// gc-compaction related configs
/// Enable automatic gc-compaction trigger on this tenant.
pub gc_compaction_enabled: bool,
/// Enable verification of gc-compaction results.
pub gc_compaction_verification: bool,
/// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
/// gc-compaction will be triggered.
pub gc_compaction_initial_threshold_kb: u64,
@@ -692,6 +694,7 @@ pub mod tenant_conf_defaults {
// image layers should be created.
pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
}
@@ -746,6 +749,7 @@ impl Default for TenantConfigToml {
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,

View File

@@ -576,6 +576,8 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_verification: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -696,6 +698,9 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_enabled: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_verification: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub gc_compaction_initial_threshold_kb: Option<u64>,
@@ -744,6 +749,7 @@ impl TenantConfig {
mut wal_receiver_protocol_override,
mut rel_size_v2_enabled,
mut gc_compaction_enabled,
mut gc_compaction_verification,
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
mut sampling_ratio,
@@ -835,6 +841,9 @@ impl TenantConfig {
patch
.gc_compaction_enabled
.apply(&mut gc_compaction_enabled);
patch
.gc_compaction_verification
.apply(&mut gc_compaction_verification);
patch
.gc_compaction_initial_threshold_kb
.apply(&mut gc_compaction_initial_threshold_kb);
@@ -876,6 +885,7 @@ impl TenantConfig {
wal_receiver_protocol_override,
rel_size_v2_enabled,
gc_compaction_enabled,
gc_compaction_verification,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
sampling_ratio,
@@ -974,6 +984,9 @@ impl TenantConfig {
gc_compaction_enabled: self
.gc_compaction_enabled
.unwrap_or(global_conf.gc_compaction_enabled),
gc_compaction_verification: self
.gc_compaction_verification
.unwrap_or(global_conf.gc_compaction_verification),
gc_compaction_initial_threshold_kb: self
.gc_compaction_initial_threshold_kb
.unwrap_or(global_conf.gc_compaction_initial_threshold_kb),

View File

@@ -9257,6 +9257,7 @@ mod tests {
&[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
3,
None,
true,
)
.await
.unwrap();
@@ -9381,7 +9382,15 @@ mod tests {
),
];
let res = tline
.generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
.generate_key_retention(
key,
&history,
Lsn(0x60),
&[Lsn(0x40), Lsn(0x50)],
3,
None,
true,
)
.await
.unwrap();
let expected_res = KeyHistoryRetention {
@@ -9460,6 +9469,7 @@ mod tests {
&[],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
true,
)
.await
.unwrap();
@@ -9508,6 +9518,7 @@ mod tests {
&[Lsn(0x30)],
3,
Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
true,
)
.await
.unwrap();

View File

@@ -2723,6 +2723,10 @@ impl Timeline {
.tenant_conf
.gc_compaction_enabled
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
let gc_compaction_verification = tenant_conf
.tenant_conf
.gc_compaction_verification
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
let gc_compaction_initial_threshold_kb = tenant_conf
.tenant_conf
.gc_compaction_initial_threshold_kb
@@ -2737,6 +2741,7 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
GcCompactionCombinedSettings {
gc_compaction_enabled,
gc_compaction_verification,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
}

View File

@@ -80,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {
pub struct GcCompactionCombinedSettings {
pub gc_compaction_enabled: bool,
pub gc_compaction_verification: bool,
pub gc_compaction_initial_threshold_kb: u64,
pub gc_compaction_ratio_percent: u64,
}
@@ -225,6 +226,7 @@ impl GcCompactionQueue {
gc_compaction_enabled,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
..
} = timeline.get_gc_compaction_settings();
if !gc_compaction_enabled {
return Ok(());
@@ -788,6 +790,114 @@ impl KeyHistoryRetention {
}
Ok(())
}
/// Verify if every key in the retention is readable by replaying the logs.
async fn verify(
&self,
key: Key,
base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
full_history: &[(Key, Lsn, Value)],
tline: &Arc<Timeline>,
) -> anyhow::Result<()> {
// Usually the min_lsn should be the first record but we do a full iteration to be safe.
let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
return Ok(());
};
let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
// This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
return Ok(());
};
let mut base_img = base_img_from_ancestor
.as_ref()
.map(|(_, lsn, img)| (*lsn, img));
let mut history = Vec::new();
async fn collect_and_verify(
key: Key,
lsn: Lsn,
base_img: &Option<(Lsn, &Bytes)>,
history: &[(Lsn, &NeonWalRecord)],
tline: &Arc<Timeline>,
) -> anyhow::Result<()> {
let mut records = history
.iter()
.map(|(lsn, val)| (*lsn, (*val).clone()))
.collect::<Vec<_>>();
// WAL redo requires records in the reverse LSN order
records.reverse();
let data = ValueReconstructState {
img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
records,
};
tline
.reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
.await
.with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
Ok(())
}
for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
for (lsn, val) in logs {
match val {
Value::Image(img) => {
base_img = Some((*lsn, img));
history.clear();
}
Value::WalRecord(rec) if val.will_init() => {
base_img = None;
history.clear();
history.push((*lsn, rec));
}
Value::WalRecord(rec) => {
history.push((*lsn, rec));
}
}
}
if *retain_lsn >= min_lsn {
// Only verify after the key appears in the full history for the first time.
if base_img.is_none() && history.is_empty() {
anyhow::bail!(
"verificatoin failed: key {} has no history at {}",
key,
retain_lsn
);
};
// We don't modify history: in theory, we could replace the history with a single
// image as in `generate_key_retention` to make redos at later LSNs faster. But we
// want to verify everything as if they are read from the real layer map.
collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
}
}
for (lsn, val) in &self.above_horizon.0 {
match val {
Value::Image(img) => {
// Above the GC horizon, we verify every time we see an image.
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
base_img = Some((*lsn, img));
history.clear();
}
Value::WalRecord(rec) if val.will_init() => {
// Above the GC horizon, we verify every time we see an init record.
collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
base_img = None;
history.clear();
history.push((*lsn, rec));
}
Value::WalRecord(rec) => {
history.push((*lsn, rec));
}
}
}
// Ensure the latest record is readable.
collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
Ok(())
}
}
#[derive(Debug, Serialize, Default)]
@@ -2210,6 +2320,7 @@ impl Timeline {
/// ```
///
/// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn generate_key_retention(
self: &Arc<Timeline>,
key: Key,
@@ -2218,6 +2329,7 @@ impl Timeline {
retain_lsn_below_horizon: &[Lsn],
delta_threshold_cnt: usize,
base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
verification: bool,
) -> anyhow::Result<KeyHistoryRetention> {
// Pre-checks for the invariants
@@ -2304,8 +2416,8 @@ impl Timeline {
"should have at least below + above horizon batches"
);
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
if let Some((key, lsn, img)) = base_img_from_ancestor {
replay_history.push((key, lsn, Value::Image(img)));
if let Some((key, lsn, ref img)) = base_img_from_ancestor {
replay_history.push((key, lsn, Value::Image(img.clone())));
}
/// Generate debug information for the replay history
@@ -2419,22 +2531,15 @@ impl Timeline {
// Whether to reconstruct the image. In debug mode, we will generate an image
// at every retain_lsn to ensure data is not corrupted, but we won't put the
// image into the final layer.
let generate_image = produce_image || debug_mode;
if produce_image {
let img_and_lsn = if produce_image {
records_since_last_image = 0;
}
let img_and_lsn = if generate_image {
let replay_history_for_debug = if debug_mode {
Some(replay_history.clone())
} else {
None
};
let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
let history = if produce_image {
std::mem::take(&mut replay_history)
} else {
replay_history.clone()
};
let history = std::mem::take(&mut replay_history);
let mut img = None;
let mut records = Vec::with_capacity(history.len());
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2469,6 +2574,7 @@ impl Timeline {
records.push((lsn, rec));
}
}
// WAL redo requires records in the reverse LSN order
records.reverse();
let state = ValueReconstructState { img, records };
// last batch does not generate image so i is always in range, unless we force generate
@@ -2501,10 +2607,16 @@ impl Timeline {
assert_eq!(retention.len(), lsn_split_points.len() + 1);
for (idx, logs) in retention.into_iter().enumerate() {
if idx == lsn_split_points.len() {
return Ok(KeyHistoryRetention {
let retention = KeyHistoryRetention {
below_horizon: result,
above_horizon: KeyLogAtLsn(logs),
});
};
if verification {
retention
.verify(key, &base_img_from_ancestor, full_history, self)
.await?;
}
return Ok(retention);
} else {
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
}
@@ -2971,6 +3083,9 @@ impl Timeline {
}
(false, res)
};
let verification = self.get_gc_compaction_settings().gc_compaction_verification;
info!(
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
job_desc.selected_layers.len(),
@@ -3287,6 +3402,7 @@ impl Timeline {
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
verification,
)
.await
.context("failed to generate key retention")
@@ -3327,6 +3443,7 @@ impl Timeline {
.await
.context("failed to get ancestor image")
.map_err(CompactionError::Other)?,
verification,
)
.await
.context("failed to generate key retention")

View File

@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
},
"rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it
"gc_compaction_enabled": True,
"gc_compaction_verification": False,
"gc_compaction_initial_threshold_kb": 1024000,
"gc_compaction_ratio_percent": 200,
"image_creation_preempt_threshold": 5,