diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 77b3f90b3e..215682d90c 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -65,7 +65,7 @@ use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; use pageserver::config::PageServerConf; -use pageserver::walredo::PostgresRedoManager; +use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; @@ -223,7 +223,14 @@ impl Request { // TODO: avoid these clones manager - .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .request_redo( + *key, + *lsn, + base_img.clone(), + records.clone(), + *pg_version, + RedoAttemptType::ReadPage, + ) .await .context("request_redo") } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2ac2fd0b81..d3623fc3b9 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walingest::WalLagCooldown; -use crate::walredo::PostgresRedoManager; +use crate::walredo::{PostgresRedoManager, RedoAttemptType}; use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); @@ -473,15 +473,16 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { match self { Self::Prod(_, mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } #[cfg(test)] Self::Test(mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } } @@ -5879,6 +5880,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, + _redo_attempt_type: RedoAttemptType, ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b10409689c..8a4a6f4b40 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::walredo::RedoAttemptType; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -1293,6 +1294,12 @@ impl Timeline { }; reconstruct_state.read_path = read_path; + let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction { + RedoAttemptType::LegacyCompaction + } else { + RedoAttemptType::ReadPage + }; + let traversal_res: Result<(), _> = { let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { @@ -1380,7 +1387,7 @@ impl Timeline { let walredo_deltas = converted.num_deltas(); let walredo_res = walredo_self - .reconstruct_value(key, lsn, converted) + .reconstruct_value(key, lsn, converted, redo_attempt_type) .maybe_perf_instrument(&ctx, |crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, @@ -6343,37 +6350,21 @@ impl Timeline { /// Reconstruct a value, using the given base image and WAL records in 'data'. async fn reconstruct_value( - &self, - key: Key, - request_lsn: Lsn, - data: ValueReconstructState, - ) -> Result { - self.reconstruct_value_inner(key, request_lsn, data, false) - .await - } - - /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because - /// sometimes it is expected to fail due to unreplayable history described in . - async fn reconstruct_value_wo_critical_error( - &self, - key: Key, - request_lsn: Lsn, - data: ValueReconstructState, - ) -> Result { - self.reconstruct_value_inner(key, request_lsn, data, true) - .await - } - - async fn reconstruct_value_inner( &self, key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - no_critical_error: bool, + redo_attempt_type: RedoAttemptType, ) -> Result { // Perform WAL redo if needed data.records.reverse(); + let fire_critical_error = match redo_attempt_type { + RedoAttemptType::ReadPage => true, + RedoAttemptType::LegacyCompaction => true, + RedoAttemptType::GcCompaction => false, + }; + // If we have a page image, and no WAL, we're all set if data.records.is_empty() { if let Some((img_lsn, img)) = &data.img { @@ -6420,13 +6411,20 @@ impl Timeline { .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? - .request_redo(key, request_lsn, data.img, data.records, self.pg_version) + .request_redo( + key, + request_lsn, + data.img, + data.records, + self.pg_version, + redo_attempt_type, + ) .await; let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { - if !no_critical_error { + if fire_critical_error { critical!("walredo failure during page reconstruction: {err:?}"); } return Err(PageReconstructError::WalRedo( diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8403c0a7d9..5f969a4e77 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -16,6 +16,8 @@ use super::{ Timeline, }; +use crate::tenant::timeline::DeltaEntry; +use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; @@ -2411,7 +2413,7 @@ impl Timeline { lsn_split_points[i] }; let img = self - .reconstruct_value_wo_critical_error(key, request_lsn, state) + .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction) .await?; Some((request_lsn, img)) } else { @@ -3909,8 +3911,6 @@ impl CompactionLayer for OwnArc { } } -use crate::tenant::timeline::DeltaEntry; - impl CompactionLayer for ResidentDeltaLayer { fn key_range(&self) -> &Range { &self.0.layer_desc().key_range diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 22d8d83811..ed8a954369 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -136,6 +136,16 @@ macro_rules! bail { } } +#[derive(Debug, Clone, Copy)] +pub enum RedoAttemptType { + /// Used for the read path. Will fire critical errors and retry twice if failure. + ReadPage, + // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure. + LegacyCompaction, + // Used for gc compaction. Will not fire critical errors and not retry. + GcCompaction, +} + /// /// Public interface of WAL redo manager /// @@ -156,11 +166,18 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { bail!("invalid WAL redo request with no records"); } + let max_retry_attempts = match redo_attempt_type { + RedoAttemptType::ReadPage => 2, + RedoAttemptType::LegacyCompaction => 1, + RedoAttemptType::GcCompaction => 0, + }; + let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); @@ -180,6 +197,7 @@ impl PostgresRedoManager { &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await }; @@ -201,6 +219,7 @@ impl PostgresRedoManager { &records[batch_start..], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await } @@ -424,11 +443,11 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, + max_retry_attempts: u32, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; - const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { let base_img = &base_img; @@ -486,7 +505,7 @@ impl PostgresRedoManager { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; - if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { + if n_attempts > max_retry_attempts || result.is_ok() { return result; } } @@ -560,6 +579,7 @@ mod tests { use super::PostgresRedoManager; use crate::config::PageServerConf; + use crate::walredo::RedoAttemptType; #[tokio::test] async fn test_ping() { @@ -593,6 +613,7 @@ mod tests { None, short_records(), 14, + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -621,6 +642,7 @@ mod tests { None, short_records(), 14, + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -642,6 +664,7 @@ mod tests { None, short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await