Set proper xl_prev in basebackup, when possible.

In passing, fix two minor issues with basebackup:
* check that we can't create branches with pre-initdb LSNs
* normalize branch LSNs that point to a segment boundary

patch by @knizhnik
closes #506
Authored by Konstantin Knizhnik on 2021-09-01 17:04:21 +03:00
Committed by Stas Kelvich
parent 45c09c1cdd
commit b227c63edf
7 changed files with 104 additions and 23 deletions
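
For background: every WAL record header in Postgres carries the start LSN of the previous record (xl_prev), and a compute node started from a basebackup has no local WAL to derive that value from, which is why this commit propagates it through a signal file. A simplified sketch of the relevant header field (field names and types abbreviated; not code from this patch):

// Background sketch only: a simplified view of the WAL record header.
// The real XLogRecord lives in the Postgres sources and has more fields.
#[allow(dead_code)]
struct XLogRecordSketch {
    xl_tot_len: u32, // total length of the record
    xl_xid: u32,     // transaction id (simplified type)
    xl_prev: u64,    // LSN where the previous record starts; the value this commit propagates
}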

View File

@@ -47,28 +47,45 @@ impl<'a> Basebackup<'a> {
        timeline: &'a Arc<dyn Timeline>,
        req_lsn: Option<Lsn>,
    ) -> Basebackup<'a> {
+       // current_prev may be zero if we are at the start of a timeline branched from an old LSN
        let RecordLsn {
-           last: lsn,
-           prev: prev_record_lsn,
-       } = if let Some(lsn) = req_lsn {
-           // FIXME: that wouldn't work since we don't know prev for old LSN's.
-           // Probably it is better to avoid using prev in compute node start
-           // at all and accept the fact that first WAL record in the timeline would
-           // have zero as prev. https://github.com/zenithdb/zenith/issues/506
-           RecordLsn {
-               last: lsn,
-               prev: lsn,
+           last: current_last,
+           prev: current_prev,
+       } = timeline.get_last_record_rlsn();
+
+       // Compute postgres doesn't have any previous WAL files, but the first record that this
+       // postgres is going to write needs to carry the LSN of the previous record (xl_prev).
+       // So we write prev_lsn to the "zenith.signal" file so that postgres can read it at startup.
+       // In some cases we don't know prev_lsn (branch or basebackup @old_lsn), so we pass Lsn(0)
+       // instead and accept the wrong xl_prev in those situations.
+       let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+           if req_lsn > current_last {
+               // FIXME: wait_lsn() is now inside list_nonrels(), so we don't have a way
+               // to get it from there. It would be better to wait right here.
+               (Lsn(0), req_lsn)
+           } else if req_lsn < current_last {
+               // We don't know prev for an old LSN. We don't currently use basebackup@old_lsn,
+               // but we may use it for read-only replicas in the future.
+               (Lsn(0), req_lsn)
+           } else {
+               // We are exactly at req_lsn and know prev.
+               (current_prev, req_lsn)
            }
        } else {
-           // Atomically get last and prev LSN's
-           timeline.get_last_record_rlsn()
+           // None in req_lsn means that we are branching from the latest LSN.
+           (current_prev, current_last)
        };
+
+       info!(
+           "taking basebackup lsn={}, prev_lsn={}",
+           backup_lsn, backup_prev
+       );
+
        Basebackup {
            ar: Builder::new(write),
            timeline,
-           lsn,
-           prev_record_lsn,
+           lsn: backup_lsn,
+           prev_record_lsn: backup_prev,
        }
    }
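
The case analysis above is the core of the change: prev is only trustworthy when the backup is taken exactly at the last record of the timeline. A standalone sketch of the same decision, using a hypothetical helper and a minimal Lsn stand-in rather than the types above, just to spell out the three req_lsn cases:

// Illustration only: `choose_backup_lsns` and this local `Lsn` newtype are
// hypothetical stand-ins for the logic and type used in the hunk above.
#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
struct Lsn(u64);

fn choose_backup_lsns(req_lsn: Option<Lsn>, current_last: Lsn, current_prev: Lsn) -> (Lsn, Lsn) {
    match req_lsn {
        Some(req) if req > current_last => (Lsn(0), req), // future LSN: prev unknown
        Some(req) if req < current_last => (Lsn(0), req), // historic LSN: prev unknown
        Some(req) => (current_prev, req),                 // exactly at the last record: prev known
        None => (current_prev, current_last),             // branch from the latest LSN: prev known
    }
}

fn main() {
    let (last, prev) = (Lsn(0x0300_0100), Lsn(0x0300_00A8));
    assert_eq!(choose_backup_lsns(None, last, prev), (prev, last));
    assert_eq!(choose_backup_lsns(Some(last), last, prev), (prev, last));
    assert_eq!(choose_backup_lsns(Some(Lsn(0x0200_0000)), last, prev), (Lsn(0), Lsn(0x0200_0000)));
}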
@@ -236,7 +253,7 @@ impl<'a> Basebackup<'a> {
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
pg_constants::WAL_SEGMENT_SIZE,
);
-       checkpoint.redo = self.lsn.0 + self.lsn.calc_padding(8u32);
+       checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
//reset some fields we don't want to preserve
//TODO Check this.
@@ -249,9 +266,14 @@ impl<'a> Basebackup<'a> {
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
+       let xl_prev = if self.prev_record_lsn == Lsn(0) {
+           0xBAD0 // magic value to indicate that we don't know prev_lsn
+       } else {
+           self.prev_record_lsn.0
+       };
        self.ar.append(
            &new_tar_header("zenith.signal", 8)?,
-           &self.prev_record_lsn.0.to_le_bytes()[..],
+           &xl_prev.to_le_bytes()[..],
)?;
//send pg_control
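
The signal file payload is just eight little-endian bytes, with 0xBAD0 as a sentinel for "prev_lsn unknown". How the compute node consumes it is outside this diff; the sketch below only assumes the encoding shown above, and the decoder name is made up for illustration:

// Sketch only: decodes the 8-byte payload written to "zenith.signal" above.
// The actual reader lives in the compute node and is not part of this patch.
fn decode_zenith_signal(buf: [u8; 8]) -> Option<u64> {
    match u64::from_le_bytes(buf) {
        0xBAD0 => None,           // magic value: prev_lsn is unknown
        xl_prev => Some(xl_prev), // use this as xl_prev of the first new record
    }
}

fn main() {
    assert_eq!(decode_zenith_signal(0xBAD0u64.to_le_bytes()), None);
    assert_eq!(decode_zenith_signal(0x0300_0028u64.to_le_bytes()), Some(0x0300_0028));
}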

View File

@@ -264,15 +264,22 @@ pub(crate) fn create_branch(
}
let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
+   let timeline = repo.get_timeline(startpoint.timelineid)?;
    if startpoint.lsn == Lsn(0) {
        // Find end of WAL on the old timeline
-       let end_of_wal = repo
-           .get_timeline(startpoint.timelineid)?
-           .get_last_record_lsn();
+       let end_of_wal = timeline.get_last_record_lsn();
        info!("branching at end of WAL: {}", end_of_wal);
        startpoint.lsn = end_of_wal;
    }
    startpoint.lsn = startpoint.lsn.align();
+   if timeline.get_start_lsn() > startpoint.lsn {
+       anyhow::bail!(
+           "invalid startpoint {} for the branch {}: less than timeline start {}",
+           startpoint.lsn,
+           branchname,
+           timeline.get_start_lsn()
+       );
+   }
// create a new timeline directory for it
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;

View File

@@ -150,12 +150,24 @@ impl Repository for LayeredRepository {
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
let src_timeline = self.get_timeline(src)?;
+       let RecordLsn {
+           last: src_last,
+           prev: src_prev,
+       } = src_timeline.get_last_record_rlsn();
+
+       // Use src_prev from the source timeline only if we branched at the last record.
+       let dst_prev = if src_last == start_lsn {
+           Some(src_prev)
+       } else {
+           None
+       };
+
        // Create the metadata file, noting the ancestor of the new timeline.
        // There is initially no data in it, but all the read-calls know to look
        // into the ancestor.
        let metadata = TimelineMetadata {
            disk_consistent_lsn: start_lsn,
-           prev_record_lsn: Some(src_timeline.get_prev_record_lsn()), // FIXME not atomic with start_lsn
+           prev_record_lsn: dst_prev,
ancestor_timeline: Some(src),
ancestor_lsn: start_lsn,
};
@@ -782,6 +794,14 @@ impl Timeline for LayeredTimeline {
fn get_last_record_rlsn(&self) -> RecordLsn {
self.last_record_lsn.load()
}
+   fn get_start_lsn(&self) -> Lsn {
+       if let Some(ancestor) = self.ancestor_timeline.as_ref() {
+           ancestor.get_start_lsn()
+       } else {
+           self.ancestor_lsn
+       }
+   }
}
impl LayeredTimeline {

View File

@@ -146,6 +146,7 @@ pub trait Timeline: Send + Sync {
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
+   fn get_start_lsn(&self) -> Lsn;
///
/// Flush to disk all data that was written with the put_* functions

View File

@@ -26,6 +26,7 @@ use std::fs::{self, File};
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
+use zenith_utils::lsn::Lsn;
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLOG_BLCKSZ: usize = 8192;
@@ -89,6 +90,21 @@ pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
}
+/// If the LSN points to the beginning of a page, shift it past the page header
+/// to the first record; otherwise align it to the 8-byte boundary required for WAL records.
+pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
+    if lsn.0 % XLOG_BLCKSZ as u64 == 0 {
+        let hdr_size = if lsn.0 % seg_sz as u64 == 0 {
+            XLOG_SIZE_OF_XLOG_LONG_PHD
+        } else {
+            XLOG_SIZE_OF_XLOG_SHORT_PHD
+        };
+        lsn + hdr_size as u64
+    } else {
+        lsn.align()
+    }
+}
pub fn get_current_timestamp() -> TimestampTz {
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
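
A few concrete inputs make normalize_lsn() easier to follow. This is an illustrative check rather than a test from the patch; it assumes the default 16 MiB WAL segment size and that Lsn::align() rounds up to the next 8-byte boundary, as the doc comment above describes:

// Illustrative check, not a test from this patch.
fn normalize_lsn_examples() {
    let seg_sz = 16 * 1024 * 1024; // assuming the default 16 MiB segment size
    // At a segment boundary: skip the long page header.
    assert_eq!(
        normalize_lsn(Lsn(0x0300_0000), seg_sz),
        Lsn(0x0300_0000) + XLOG_SIZE_OF_XLOG_LONG_PHD as u64
    );
    // At a page boundary inside a segment: skip the short page header.
    assert_eq!(
        normalize_lsn(Lsn(0x0300_2000), seg_sz),
        Lsn(0x0300_2000) + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64
    );
    // Anywhere else: just 8-byte alignment.
    assert_eq!(normalize_lsn(Lsn(0x0300_0042), seg_sz), Lsn(0x0300_0048));
}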
@@ -416,7 +432,6 @@ mod tests {
use super::*;
use regex::Regex;
use std::{env, process::Command, str::FromStr};
-   use zenith_utils::lsn::Lsn;
// Run find_end_of_wal against file in test_wal dir
// Ensure that it finds last record correctly

View File

@@ -1,3 +1,4 @@
+import subprocess
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
@@ -74,3 +75,18 @@ def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: Postg
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo')
assert main_cur.fetchone() == (200100, )
+   # Check bad LSNs for branching
+
+   # branch at a segment boundary
+   zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
+   pg = postgres.create_start("test_branch_segment_boundary")
+   cur = pg.connect().cursor()
+   cur.execute('SELECT 1')
+   assert cur.fetchone() == (1, )
+
+   # branch at a pre-initdb LSN (should be rejected)
+   try:
+       zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
+   except subprocess.CalledProcessError:
+       print("Branch creation with pre-initdb LSN failed (as expected)")