From 0128948d8d5b0e3079482139200da2612d6ff67e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 4 Dec 2024 15:56:41 +0200 Subject: [PATCH] Abuse Checkpoint.redo field to distinguish shutdown checkpoints --- libs/postgres_ffi/src/lib.rs | 5 +- libs/postgres_ffi/src/pg_constants.rs | 2 - libs/postgres_ffi/src/xlog_utils.rs | 49 ++++++++++++++++++-- pageserver/src/basebackup.rs | 67 ++++++++++++++------------- pageserver/src/walingest.rs | 47 ++++++++++++++++--- 5 files changed, 121 insertions(+), 49 deletions(-) diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 7a3260c4e8..301bc2f16e 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -231,8 +231,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo}; pub use v14::bindings::{MultiXactOffset, MultiXactStatus}; pub use v14::bindings::{PageHeaderData, XLogRecord}; pub use v14::xlog_utils::{ - SIZEOF_CHECKPOINT, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, - XLOG_SIZE_OF_XLOG_SHORT_PHD, + XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, }; pub use v14::bindings::{CheckPoint, ControlFileData}; @@ -279,7 +278,7 @@ pub fn generate_pg_control( checkpoint_bytes: &[u8], lsn: Lsn, pg_version: u32, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { dispatch_pgversion!( pg_version, pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn), diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index b219576f7e..e343473d77 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -211,8 +211,6 @@ pub const BKPBLOCK_HAS_DATA: u8 = 0x20; pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */ -pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 2; - /* Information stored in bimg_info */ pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */ diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 852b20eace..e32e44dd00 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -124,23 +124,64 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { } } +/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN +/// +/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in +/// the pageserver. They use the same format as the PostgreSQL control file and the +/// checkpoint record, but see walingest.rs for how exactly they are kept up to date. +/// 'lsn' is the LSN at which we're starting up. +/// +/// Returns: +/// - pg_control file contents +/// - system_identifier, extracted from the persisted information +/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown +/// checkpoint at the given LSN pub fn generate_pg_control( pg_control_bytes: &[u8], checkpoint_bytes: &[u8], lsn: Lsn, -) -> anyhow::Result<(Bytes, u64)> { +) -> anyhow::Result<(Bytes, u64, bool)> { let mut pg_control = ControlFileData::decode(pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?; + let was_shutdown; // Generate new pg_control needed for bootstrap - checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + // + // NB: In the checkpoint struct that we persist in the pageserver, we have a different + // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint, + // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to + // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0. + // + // We didn't always have this convention however, and old persisted records will have + // old REDO values that point to some old LSN. + // + // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown + // checkpoint record at that point in WAL, with no new WAL records after it. That case + // can be treated as starting from a clean shutdown. All other cases are treated as + // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so + // that distinction doesn't matter very much. As of this writing, it only affects + // whether the persisted pg_stats information can be used or not. + // + // In the Checkpoint struct in the returned pg_control file, the redo pointer is + // always set to the LSN we're starting at, to hint that no WAL replay is required. + // (There's some neon-specific code in Postgres startup to make that work, though. + // Just setting the redo pointer is not sufficient.) + if Lsn(checkpoint.redo) == lsn { + was_shutdown = true; + } else { + checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0; + was_shutdown = false; + } - //save new values in pg_control + // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown. The + // neon-specific code at postgres startup ignores the state stored in the control + // file, similar to archive recovery in standalone PostgreSQL. Similarly, the + // checkPoint pointer is ignored, so just set it to 0. pg_control.checkPoint = 0; pg_control.checkPointCopy = checkpoint; pg_control.state = DBState_DB_SHUTDOWNED; - Ok((pg_control.encode(), pg_control.system_identifier)) + Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown)) } pub fn get_current_timestamp() -> TimestampTz { diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d6630c4aee..c4e2583555 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -28,12 +28,12 @@ use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::dispatch_pgversion; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PG_HBA}; use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; -use postgres_ffi::{dispatch_pgversion, CheckPoint}; use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE}; use utils::lsn::Lsn; @@ -255,16 +255,30 @@ where async fn send_tarball(mut self) -> Result<(), BasebackupError> { // TODO include checksum - // Detect if we are creating the basebackup exactly at a shutdown checkpoint. - let normal_shutdown = - if let Ok(checkpoint_bytes) = self.timeline.get_checkpoint(self.lsn, self.ctx).await { - let checkpoint = - CheckPoint::decode(&checkpoint_bytes).context("deserialize checkpoint")?; - // We store "redo" field only for shutdown checkpoint - checkpoint.redo != 0 - } else { - false - }; + // Construct the pg_control file from the persisted checkpoint and pg_control + // information. But we only add this to the tarball at the end, so that if the + // writing is interrupted half-way through, the resulting incomplete tarball will + // be missing the pg_control file, which prevents PostgreSQL from starting up on + // it. With proper error handling, you should never try to start up from an + // incomplete basebackup in the first place, of course, but this is a nice little + // extra safety measure. + let checkpoint_bytes = self + .timeline + .get_checkpoint(self.lsn, self.ctx) + .await + .context("failed to get checkpoint bytes")?; + let pg_control_bytes = self + .timeline + .get_control_file(self.lsn, self.ctx) + .await + .context("failed get control bytes")?; + let (pg_control_bytes, system_identifier, was_shutdown) = + postgres_ffi::generate_pg_control( + &pg_control_bytes, + &checkpoint_bytes, + self.lsn, + self.timeline.pg_version, + )?; let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup; @@ -403,7 +417,7 @@ where // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, // but now we should handle (skip) it for backward compatibility. continue; - } else if path == "pg_stat/pgstat.stat" && !normal_shutdown { + } else if path == "pg_stat/pgstat.stat" && !was_shutdown { // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN // of a shutdown checkpoint. continue; @@ -468,8 +482,9 @@ where ))) }); - // Generate pg_control and bootstrap WAL segment. - self.add_pgcontrol_file().await?; + // Last, add the pg_control file and bootstrap WAL segment. + self.add_pgcontrol_file(pg_control_bytes, system_identifier) + .await?; self.ar.finish().await.map_err(BasebackupError::Client)?; debug!("all tarred up!"); Ok(()) @@ -672,7 +687,11 @@ where // Add generated pg_control file and bootstrap WAL segment. // Also send zenith.signal file with extra bootstrap data. // - async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> { + async fn add_pgcontrol_file( + &mut self, + pg_control_bytes: Bytes, + system_identifier: u64, + ) -> Result<(), BasebackupError> { // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { @@ -695,24 +714,6 @@ where .await .map_err(BasebackupError::Client)?; - let checkpoint_bytes = self - .timeline - .get_checkpoint(self.lsn, self.ctx) - .await - .context("failed to get checkpoint bytes")?; - let pg_control_bytes = self - .timeline - .get_control_file(self.lsn, self.ctx) - .await - .context("failed get control bytes")?; - - let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control( - &pg_control_bytes, - &checkpoint_bytes, - self.lsn, - self.timeline.pg_version, - )?; - //send pg_control let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?; self.ar diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0d4af0f733..4e115ef1ee 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1188,18 +1188,51 @@ impl WalIngest { } else { cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; } + // NB: We abuse the Checkpoint.redo field: // - // There is no any field in CheckPoint which allows to distinguish online - // and shutdown checkpoint. And we need to know it to detect normal shutdown. - // Previously "redo" field was not set at all. - // So let's assign this field only for shutdown checkpoint and left it zero - // for online checkpoint. + // - In PostgreSQL, the Checkpoint struct doesn't store the information + // of whether this is an online checkpoint or a shutdown checkpoint. It's + // stored in the XLOG info field of the WAL record, shutdown checkpoints + // use record type XLOG_CHECKPOINT_SHUTDOWN and online checkpoints use + // XLOG_CHECKPOINT_ONLINE. We don't store the original WAL record headers + // in the pageserver, however. // + // - In PostgreSQL, the Checkpoint.redo field stores the *start* of the + // checkpoint record, if it's a shutdown checkpoint. But when we are + // starting from a shutdown checkpoint, the basebackup LSN is the *end* + // of the shutdown checkpoint WAL record. That makes it difficult to + // correctly detect whether we're starting from a shutdown record or + // not. + // + // To address both of those issues, we store 0 in the redo field if it's + // an online checkpoint record, and the record's *end* LSN if it's a + // shutdown checkpoint. We don't need the original redo pointer in neon, + // because we don't perform WAL replay at startup anyway, so we can get + // away with abusing the redo field like this. + // + // XXX: Ideally, we would persist the extra information in a more + // explicit format, rather than repurpose the fields of the Postgres + // struct like this. However, we already have persisted data like this, + // so we need to maintain backwards compatibility. + // + // NB: We didn't originally have this convention, so there are still old + // persisted records that didn't do this. Before, we didn't update the + // persisted redo field at all. That means that old records have a bogus + // redo pointer that points to some old value, from the checkpoint record + // that was originally imported from the data directory. If it was a + // project created in Neon, that means it points to the first checkpoint + // after initdb. That's OK for our purposes: all such old checkpoints are + // treated as old online checkpoints when the basebackup is created. cp.redo = if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN { - xlog_checkpoint.redo + // Store the *end* LSN of the checkpoint record. Or to be precise, + // the start LSN of the *next* record, i.e. if the record ends + // exactly at page boundary, the redo LSN points to just after the + // page header on the next page. + lsn.into() } else { - 0 + Lsn::INVALID.into() }; + // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to // have some trace of the checkpoint records in the layer files at the same