diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index fe5114a247..c75f940386 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -46,6 +46,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; pub const LOG_FILE_NAME: &str = "pageserver.log"; +static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); + /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 424ce4769a..ca931ed37d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 } +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a771f82caf..d6ce644bb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -34,6 +34,7 @@ use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; @@ -52,6 +53,7 @@ use crate::task_mgr::TaskKind; use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; +use crate::ZERO_PAGE; use crate::{ page_cache, storage_sync::{self, 
index::LayerFileMetadata}, @@ -1496,7 +1498,32 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = self.get(key, lsn)?; + let img = match self.get(key, lsn) { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but the Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, as that would lead to actual data + // loss. 
+ if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; image_layer_writer.put_image(key, &img)?; key = key.next(); } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d3d2c6d9b2..9a6b99d991 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,6 +34,7 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use crate::ZERO_PAGE; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; @@ -43,8 +44,6 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - pub struct WalIngest<'a> { timeline: &'a Timeline,