From eb1bdcc6cfb72293019186021f40297affc6dc33 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 20 Oct 2022 13:21:36 +0300 Subject: [PATCH] If an FSM or VM page cannot be reconstructed, fill it with zeros. If we cannot reconstruct an FSM or VM page, while creating image layers, fill it with zeros instead. That should always be safe, for the FSM and VM, in the sense that you won't lose actual user data. It will get cleaned up by VACUUM later. We had a bug with FSM/VM truncation, where we truncated the FSM and VM at WAL replay to a smaller size than PostgreSQL originally did. We thought that was harmless, as the FSM and VM are not critical for correctness and can be zeroed out or truncated without affecting user data. However, it led to a situation where PostgreSQL created incremental WAL records for pages that we had already truncated away in the pageserver, and when we tried to replay those WAL records, that failed. That led to a permanent error in image layer creation, and prevented it from ever finishing. See https://github.com/neondatabase/neon/issues/2601. With this patch, those pages will be filled with zeros in the image layer, which allows the image layer creation to finish. 
--- pageserver/src/lib.rs | 2 ++ pageserver/src/pgdatadir_mapping.rs | 11 +++++++++++ pageserver/src/tenant/timeline.rs | 29 ++++++++++++++++++++++++++++- pageserver/src/walingest.rs | 3 +-- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index fe5114a247..c75f940386 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -46,6 +46,8 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61; pub const LOG_FILE_NAME: &str = "pageserver.log"; +static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); + /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 424ce4769a..ca931ed37d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1373,6 +1373,17 @@ fn is_rel_block_key(key: Key) -> bool { key.field1 == 0x00 && key.field4 != 0 } +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { Ok(match key.field1 { 0x01 => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a771f82caf..d6ce644bb5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -34,6 +34,7 @@ use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; @@ -52,6 +53,7 @@ use 
crate::task_mgr::TaskKind; use crate::walreceiver::{is_etcd_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; +use crate::ZERO_PAGE; use crate::{ page_cache, storage_sync::{self, index::LayerFileMetadata}, @@ -1496,7 +1498,32 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = self.get(key, lsn)?; + let img = match self.get(key, lsn) { + Ok(img) => img, + Err(err) => { + // If we fail to reconstruct a VM or FSM page, we can zero the + // page without losing any actual user data. That seems better + // than failing repeatedly and getting stuck. + // + // We had a bug at one point, where we truncated the FSM and VM + // in the pageserver, but Postgres didn't know about that + // and continued to generate incremental WAL records for pages + // that didn't exist in the pageserver. Trying to replay those + // WAL records failed to find the previous image of the page. + // This special case allows us to recover from that situation. + // See https://github.com/neondatabase/neon/issues/2601. + // + // Unfortunately we cannot do this for the main fork, or for + // any metadata keys, as that would lead to actual data + // loss. 
+ if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) { + warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}"); + ZERO_PAGE.clone() + } else { + return Err(err); + } + } + }; image_layer_writer.put_image(key, &img)?; key = key.next(); } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d3d2c6d9b2..9a6b99d991 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -34,6 +34,7 @@ use crate::pgdatadir_mapping::*; use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; +use crate::ZERO_PAGE; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; @@ -43,8 +44,6 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - pub struct WalIngest<'a> { timeline: &'a Timeline,