diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 591eb3728b..5c985e6dc8 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -535,6 +535,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_verification: settings
+                .remove("gc_compaction_verification")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_verification' as bool")?,
             gc_compaction_initial_threshold_kb: settings
                 .remove("gc_compaction_initial_threshold_kb")
                 .map(|x| x.parse::<u64>())
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index bd9f7efb7f..3820022011 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -452,6 +452,8 @@ pub struct TenantConfigToml {
     // gc-compaction related configs
     /// Enable automatic gc-compaction trigger on this tenant.
     pub gc_compaction_enabled: bool,
+    /// Enable verification of gc-compaction results.
+    pub gc_compaction_verification: bool,
     /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
     /// gc-compaction will be triggered.
     pub gc_compaction_initial_threshold_kb: u64,
@@ -692,6 +694,7 @@ pub mod tenant_conf_defaults {
    // image layers should be created.
    pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
    pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+   pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
@@ -746,6 +749,7 @@ impl Default for TenantConfigToml {
             wal_receiver_protocol_override: None,
             rel_size_v2_enabled: false,
             gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
             gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
             gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
             sampling_ratio: None,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 34a419f2cf..f491ed10e1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -576,6 +576,8 @@ pub struct TenantConfigPatch {
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_enabled: FieldPatch<bool>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_verification: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -696,6 +698,9 @@ pub struct TenantConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_enabled: Option<bool>,

+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_verification: Option<bool>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_initial_threshold_kb: Option<u64>,

@@ -744,6 +749,7 @@ impl TenantConfig {
             mut wal_receiver_protocol_override,
             mut rel_size_v2_enabled,
             mut gc_compaction_enabled,
+            mut gc_compaction_verification,
             mut gc_compaction_initial_threshold_kb,
             mut gc_compaction_ratio_percent,
             mut sampling_ratio,
@@ -835,6 +841,9 @@ impl TenantConfig {
         patch
             .gc_compaction_enabled
             .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_verification
+            .apply(&mut gc_compaction_verification);
         patch
             .gc_compaction_initial_threshold_kb
             .apply(&mut gc_compaction_initial_threshold_kb);
@@ -876,6 +885,7 @@ impl TenantConfig {
             wal_receiver_protocol_override,
             rel_size_v2_enabled,
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
             sampling_ratio,
@@ -974,6 +984,9 @@ impl TenantConfig {
             gc_compaction_enabled: self
                 .gc_compaction_enabled
                 .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_verification: self
+                .gc_compaction_verification
+                .unwrap_or(global_conf.gc_compaction_verification),
             gc_compaction_initial_threshold_kb: self
                 .gc_compaction_initial_threshold_kb
                 .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ad4a0d804d..4e3d849032 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -9257,6 +9257,7 @@ mod tests {
                 &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                 3,
                 None,
+                true,
             )
             .await
             .unwrap();
@@ -9381,7 +9382,15 @@ mod tests {
             ),
         ];
         let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x40), Lsn(0x50)],
+                3,
+                None,
+                true,
+            )
             .await
             .unwrap();
         let expected_res = KeyHistoryRetention {
@@ -9460,6 +9469,7 @@ mod tests {
                 &[],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
@@ -9508,6 +9518,7 @@ mod tests {
                 &[Lsn(0x30)],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8a4a6f4b40..67a16db040 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2723,6 +2723,10 @@ impl Timeline {
             .tenant_conf
             .gc_compaction_enabled
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
+        let gc_compaction_verification = tenant_conf
+            .tenant_conf
+            .gc_compaction_verification
+            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
         let gc_compaction_initial_threshold_kb = tenant_conf
             .tenant_conf
             .gc_compaction_initial_threshold_kb
@@ -2737,6 +2741,7 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
         GcCompactionCombinedSettings {
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
         }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index a559c7fdec..e3aa5045bb 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -80,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {

 pub struct GcCompactionCombinedSettings {
     pub gc_compaction_enabled: bool,
+    pub gc_compaction_verification: bool,
     pub gc_compaction_initial_threshold_kb: u64,
     pub gc_compaction_ratio_percent: u64,
 }
@@ -225,6 +226,7 @@ impl GcCompactionQueue {
             gc_compaction_enabled,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
+            ..
         } = timeline.get_gc_compaction_settings();
         if !gc_compaction_enabled {
             return Ok(());
@@ -788,6 +790,114 @@ impl KeyHistoryRetention {
         }
         Ok(())
     }
+
+    /// Verify if every key in the retention is readable by replaying the logs.
+    async fn verify(
+        &self,
+        key: Key,
+        base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
+        full_history: &[(Key, Lsn, Value)],
+        tline: &Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        // Usually the min_lsn should be the first record but we do a full iteration to be safe.
+        let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let mut base_img = base_img_from_ancestor
+            .as_ref()
+            .map(|(_, lsn, img)| (*lsn, img));
+        let mut history = Vec::new();
+
+        async fn collect_and_verify(
+            key: Key,
+            lsn: Lsn,
+            base_img: &Option<(Lsn, &Bytes)>,
+            history: &[(Lsn, &NeonWalRecord)],
+            tline: &Arc<Timeline>,
+        ) -> anyhow::Result<()> {
+            let mut records = history
+                .iter()
+                .map(|(lsn, val)| (*lsn, (*val).clone()))
+                .collect::<Vec<_>>();
+
+            // WAL redo requires records in the reverse LSN order
+            records.reverse();
+            let data = ValueReconstructState {
+                img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
+                records,
+            };
+
+            tline
+                .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
+                .await
+                .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
+
+            Ok(())
+        }
+
+        for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
+            for (lsn, val) in logs {
+                match val {
+                    Value::Image(img) => {
+                        base_img = Some((*lsn, img));
+                        history.clear();
+                    }
+                    Value::WalRecord(rec) if val.will_init() => {
+                        base_img = None;
+                        history.clear();
+                        history.push((*lsn, rec));
+                    }
+                    Value::WalRecord(rec) => {
+                        history.push((*lsn, rec));
+                    }
+                }
+            }
+            if *retain_lsn >= min_lsn {
+                // Only verify after the key appears in the full history for the first time.
+
+                if base_img.is_none() && history.is_empty() {
+                    anyhow::bail!(
+                        "verification failed: key {} has no history at {}",
+                        key,
+                        retain_lsn
+                    );
+                };
+                // We don't modify history: in theory, we could replace the history with a single
+                // image as in `generate_key_retention` to make redos at later LSNs faster. But we
+                // want to verify everything as if they are read from the real layer map.
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
+            }
+        }
+
+        for (lsn, val) in &self.above_horizon.0 {
+            match val {
+                Value::Image(img) => {
+                    // Above the GC horizon, we verify every time we see an image.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = Some((*lsn, img));
+                    history.clear();
+                }
+                Value::WalRecord(rec) if val.will_init() => {
+                    // Above the GC horizon, we verify every time we see an init record.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = None;
+                    history.clear();
+                    history.push((*lsn, rec));
+                }
+                Value::WalRecord(rec) => {
+                    history.push((*lsn, rec));
+                }
+            }
+        }
+        // Ensure the latest record is readable.
+        collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
+        Ok(())
+    }
 }

 #[derive(Debug, Serialize, Default)]
@@ -2210,6 +2320,7 @@ impl Timeline {
     /// ```
     ///
     /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    #[allow(clippy::too_many_arguments)]
     pub(crate) async fn generate_key_retention(
         self: &Arc<Self>,
         key: Key,
@@ -2218,6 +2329,7 @@ impl Timeline {
         retain_lsn_below_horizon: &[Lsn],
         delta_threshold_cnt: usize,
         base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+        verification: bool,
     ) -> anyhow::Result<KeyHistoryRetention> {
         // Pre-checks for the invariants

@@ -2304,8 +2416,8 @@ impl Timeline {
             "should have at least below + above horizon batches"
         );
         let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
+        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
+            replay_history.push((key, lsn, Value::Image(img.clone())));
         }

         /// Generate debug information for the replay history
@@ -2419,22 +2531,15 @@ impl Timeline {
             // Whether to reconstruct the image. In debug mode, we will generate an image
             // at every retain_lsn to ensure data is not corrupted, but we won't put the
             // image into the final layer.
-            let generate_image = produce_image || debug_mode;
-            if produce_image {
+            let img_and_lsn = if produce_image {
                 records_since_last_image = 0;
-            }
-            let img_and_lsn = if generate_image {
                 let replay_history_for_debug = if debug_mode {
                     Some(replay_history.clone())
                 } else {
                     None
                 };
                 let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
-                let history = if produce_image {
-                    std::mem::take(&mut replay_history)
-                } else {
-                    replay_history.clone()
-                };
+                let history = std::mem::take(&mut replay_history);
                 let mut img = None;
                 let mut records = Vec::with_capacity(history.len());
                 if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2469,6 +2574,7 @@ impl Timeline {
                         records.push((lsn, rec));
                     }
                 }
+                // WAL redo requires records in the reverse LSN order
                 records.reverse();
                 let state = ValueReconstructState { img, records };
                 // last batch does not generate image so i is always in range, unless we force generate
@@ -2501,10 +2607,16 @@ impl Timeline {
         assert_eq!(retention.len(), lsn_split_points.len() + 1);
         for (idx, logs) in retention.into_iter().enumerate() {
             if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
+                let retention = KeyHistoryRetention {
                     below_horizon: result,
                     above_horizon: KeyLogAtLsn(logs),
-                });
+                };
+                if verification {
+                    retention
+                        .verify(key, &base_img_from_ancestor, full_history, self)
+                        .await?;
+                }
+                return Ok(retention);
             } else {
                 result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
             }
@@ -2971,6 +3083,9 @@ impl Timeline {
             }
             (false, res)
         };
+
+        let verification = self.get_gc_compaction_settings().gc_compaction_verification;
+
         info!(
             "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
             job_desc.selected_layers.len(),
@@ -3287,6 +3402,7 @@ impl Timeline {
                         .await
                         .context("failed to get ancestor image")
                         .map_err(CompactionError::Other)?,
+                    verification,
                 )
                 .await
                 .context("failed to generate key retention")
@@ -3327,6 +3443,7 @@ impl Timeline {
                     .await
                     .context("failed to get ancestor image")
                     .map_err(CompactionError::Other)?,
+                verification,
             )
             .await
             .context("failed to generate key retention")
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 5021cc4b17..9b6930695c 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         },
"rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it "gc_compaction_enabled": True, + "gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, "image_creation_preempt_threshold": 5,