diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 8287fee7af..2368c3a60c 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -47,28 +47,45 @@ impl<'a> Basebackup<'a> { timeline: &'a Arc, req_lsn: Option, ) -> Basebackup<'a> { + // current_prev may be zero if we are at the start of timeline branched from old lsn let RecordLsn { - last: lsn, - prev: prev_record_lsn, - } = if let Some(lsn) = req_lsn { - // FIXME: that wouldn't work since we don't know prev for old LSN's. - // Probably it is better to avoid using prev in compute node start - // at all and accept the fact that first WAL record in the timeline would - // have zero as prev. https://github.com/zenithdb/zenith/issues/506 - RecordLsn { - last: lsn, - prev: lsn, + last: current_last, + prev: current_prev, + } = timeline.get_last_record_rlsn(); + + // Compute postgres doesn't have any previous WAL files, but the first record that this + // postgres is going to write need to have LSN of previous record (xl_prev). So we are + // writing prev_lsn to "zenith.signal" file so that postgres can read it during the start. + // In some cases we don't know prev_lsn (branch or basebackup @old_lsn) so pass Lsn(0) + // instead and embrace the wrong xl_prev in this situations. + let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { + if req_lsn > current_last { + // FIXME: now wait_lsn() is inside of list_nonrels() so we don't have a way + // to get it from there. It is better to wait just here. + (Lsn(0), req_lsn) + } else if req_lsn < current_last { + // we don't know prev already. We don't currently use basebackup@old_lsn + // but may use it for read only replicas in future + (Lsn(0), req_lsn) + } else { + // we are exactly at req_lsn and know prev + (current_prev, req_lsn) } } else { - // Atomically get last and prev LSN's - timeline.get_last_record_rlsn() + // None in req_lsn means that we are branching from the latest LSN + (current_prev, current_last) }; + info!( + "taking basebackup lsn={}, prev_lsn={}", + backup_prev, backup_lsn + ); + Basebackup { ar: Builder::new(write), timeline, - lsn, - prev_record_lsn, + lsn: backup_lsn, + prev_record_lsn: backup_prev, } } @@ -236,7 +253,7 @@ impl<'a> Basebackup<'a> { XLOG_SIZE_OF_XLOG_LONG_PHD as u32, pg_constants::WAL_SEGMENT_SIZE, ); - checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32); + checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0; //reset some fields we don't want to preserve //TODO Check this. @@ -249,9 +266,14 @@ impl<'a> Basebackup<'a> { pg_control.state = pg_constants::DB_SHUTDOWNED; // add zenith.signal file + let xl_prev = if self.prev_record_lsn == Lsn(0) { + 0xBAD0 // magic value to indicate that we don't know prev_lsn + } else { + self.prev_record_lsn.0 + }; self.ar.append( &new_tar_header("zenith.signal", 8)?, - &self.prev_record_lsn.0.to_le_bytes()[..], + &xl_prev.to_le_bytes()[..], )?; //send pg_control diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 333b77a8a1..e8ed47a1df 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -264,15 +264,22 @@ pub(crate) fn create_branch( } let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?; - + let timeline = repo.get_timeline(startpoint.timelineid)?; if startpoint.lsn == Lsn(0) { // Find end of WAL on the old timeline - let end_of_wal = repo - .get_timeline(startpoint.timelineid)? - .get_last_record_lsn(); + let end_of_wal = timeline.get_last_record_lsn(); info!("branching at end of WAL: {}", end_of_wal); startpoint.lsn = end_of_wal; } + startpoint.lsn = startpoint.lsn.align(); + if timeline.get_start_lsn() > startpoint.lsn { + anyhow::bail!( + "invalid startpoint {} for the branch {}: less than timeline start {}", + startpoint.lsn, + branchname, + timeline.get_start_lsn() + ); + } // create a new timeline directory for it let newtli = create_timeline(conf, Some(startpoint), tenantid)?; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 4806a0b2a1..e9db64fdfb 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -150,12 +150,24 @@ impl Repository for LayeredRepository { fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> { let src_timeline = self.get_timeline(src)?; + let RecordLsn { + last: src_last, + prev: src_prev, + } = src_timeline.get_last_record_rlsn(); + + // Use src_prev from the source timeline only if we branched at the last record. + let dst_prev = if src_last == start_lsn { + Some(src_prev) + } else { + None + }; + // Create the metadata file, noting the ancestor of the new timeline. // There is initially no data in it, but all the read-calls know to look // into the ancestor. let metadata = TimelineMetadata { disk_consistent_lsn: start_lsn, - prev_record_lsn: Some(src_timeline.get_prev_record_lsn()), // FIXME not atomic with start_lsn + prev_record_lsn: dst_prev, ancestor_timeline: Some(src), ancestor_lsn: start_lsn, }; @@ -782,6 +794,14 @@ impl Timeline for LayeredTimeline { fn get_last_record_rlsn(&self) -> RecordLsn { self.last_record_lsn.load() } + + fn get_start_lsn(&self) -> Lsn { + if let Some(ancestor) = self.ancestor_timeline.as_ref() { + ancestor.get_start_lsn() + } else { + self.ancestor_lsn + } + } } impl LayeredTimeline { diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d6ea346a70..293cf03550 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -146,6 +146,7 @@ pub trait Timeline: Send + Sync { /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. fn get_last_record_lsn(&self) -> Lsn; fn get_prev_record_lsn(&self) -> Lsn; + fn get_start_lsn(&self) -> Lsn; /// /// Flush to disk all data that was written with the put_* functions diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs index 064a061195..68ac52e30f 100644 --- a/postgres_ffi/src/xlog_utils.rs +++ b/postgres_ffi/src/xlog_utils.rs @@ -26,6 +26,7 @@ use std::fs::{self, File}; use std::io::prelude::*; use std::path::{Path, PathBuf}; use std::time::SystemTime; +use zenith_utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; pub const XLOG_BLCKSZ: usize = 8192; @@ -89,6 +90,21 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool { fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]) } +/// If LSN points to the beginning of the page, then shift it to first record, +/// otherwise align on 8-bytes boundary (required for WAL records) +pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn { + if lsn.0 % XLOG_BLCKSZ as u64 == 0 { + let hdr_size = if lsn.0 % seg_sz as u64 == 0 { + XLOG_SIZE_OF_XLOG_LONG_PHD + } else { + XLOG_SIZE_OF_XLOG_SHORT_PHD + }; + lsn + hdr_size as u64 + } else { + lsn.align() + } +} + pub fn get_current_timestamp() -> TimestampTz { const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */ const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */ @@ -416,7 +432,6 @@ mod tests { use super::*; use regex::Regex; use std::{env, process::Command, str::FromStr}; - use zenith_utils::lsn::Lsn; // Run find_end_of_wal against file in test_wal dir // Ensure that it finds last record correctly diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 6b9e91a258..19e5384bba 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -1,3 +1,4 @@ +import subprocess from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver @@ -74,3 +75,18 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg # All the rows are visible on the main branch main_cur.execute('SELECT count(*) FROM foo') assert main_cur.fetchone() == (200100, ) + + # Check bad lsn's for branching + + # branch at segment boundary + zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"]) + pg = postgres.create_start("test_branch_segment_boundary") + cur = pg.connect().cursor() + cur.execute('SELECT 1') + assert cur.fetchone() == (1, ) + + # branch at pre-initdb lsn + try: + zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"]) + except subprocess.CalledProcessError: + print("Branch creation with pre-initdb LSN failed (as expected)") diff --git a/vendor/postgres b/vendor/postgres index 909c606355..3da8bd673b 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 909c606355dcf5b53a8d97e81e28997675de5936 +Subproject commit 3da8bd673b7c3962cb4e455af32394edf677717c