mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 22:12:56 +00:00
Fix various bugs caused by switch to new storage model
This commit is contained in:
@@ -123,13 +123,12 @@ impl<'a> Basebackup<'a> {
|
||||
tag: &ObjectTag,
|
||||
page: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn(*tag, self.lsn)?;
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if !img.is_empty() {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if self.slru_segno != segno || self.slru_path != path {
|
||||
if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
@@ -158,7 +157,8 @@ impl<'a> Basebackup<'a> {
|
||||
// Extract pg_filenode.map files from repository
|
||||
//
|
||||
fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn(*tag, self.lsn)?;
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
info!("add_relmap_file {:?}", db);
|
||||
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
String::from("global/pg_filenode.map")
|
||||
} else {
|
||||
@@ -179,7 +179,7 @@ impl<'a> Basebackup<'a> {
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, tag: &ObjectTag, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn(*tag, self.lsn)?;
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
@@ -197,13 +197,12 @@ impl<'a> Basebackup<'a> {
|
||||
let most_recent_lsn = Lsn(0);
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(ObjectTag::Checkpoint, most_recent_lsn)?;
|
||||
.get_page_at_lsn_nowait(ObjectTag::Checkpoint, most_recent_lsn)?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(ObjectTag::ControlFile, most_recent_lsn)?;
|
||||
.get_page_at_lsn_nowait(ObjectTag::ControlFile, most_recent_lsn)?;
|
||||
let mut pg_control = postgres_ffi::decode_pg_control(pg_control_bytes)?;
|
||||
let mut checkpoint = postgres_ffi::decode_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
// Here starts pg_resetwal inspired magic
|
||||
// Generate new pg_control and WAL needed for bootstrap
|
||||
let new_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE) + 1;
|
||||
@@ -241,7 +240,7 @@ impl<'a> Basebackup<'a> {
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC,
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pg_control.checkPointCopy.redo - SizeOfXLogLongPHD as u64,
|
||||
|
||||
@@ -127,9 +127,6 @@ pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
|
||||
force_crash_recovery(&tmppath)?;
|
||||
println!("updated pg_control");
|
||||
|
||||
// Move the data directory as an initial base backup.
|
||||
// FIXME: It would be enough to only copy the non-relation files here, the relation
|
||||
// data was already loaded into the repository.
|
||||
@@ -345,6 +342,7 @@ fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
|
||||
<<<<<<< HEAD
|
||||
// If control file says the cluster was shut down cleanly, modify it, to mark
|
||||
// it as crashed. That forces crash recovery when you start the cluster.
|
||||
//
|
||||
@@ -366,6 +364,8 @@ fn force_crash_recovery(datadir: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
=======
|
||||
>>>>>>> Fix various bugs caused by switch to new storage model
|
||||
fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
|
||||
@@ -138,8 +138,7 @@ impl Repository for ObjectRepository {
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, at_lsn: Lsn) -> Result<()> {
|
||||
// just to check the source timeline exists
|
||||
let _ = self.get_timeline(src)?;
|
||||
let src_timeline = self.get_timeline(src)?;
|
||||
|
||||
// Write a metadata key, noting the ancestor of th new timeline. There is initially
|
||||
// no data in it, but all the read-calls know to look into the ancestor.
|
||||
@@ -156,6 +155,19 @@ impl Repository for ObjectRepository {
|
||||
&ObjectValue::ser(&val)?,
|
||||
)?;
|
||||
|
||||
// Copy non-rel objects
|
||||
for tag in src_timeline.list_nonrels(at_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::TimelineMetadataTag => {} // skip it
|
||||
_ => {
|
||||
let img = src_timeline.get_page_at_lsn_nowait(tag, at_lsn)?;
|
||||
let val = ObjectValue::Page(img);
|
||||
let key = ObjectKey { timeline: dst, tag };
|
||||
let lsn = if tag.is_versioned() { at_lsn } else { Lsn(0) };
|
||||
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -255,6 +267,50 @@ impl Timeline for ObjectTimeline {
|
||||
self.get_page_at_lsn_nowait(tag, lsn)
|
||||
}
|
||||
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes> {
|
||||
// Look up the page entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let searchkey = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag,
|
||||
};
|
||||
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
|
||||
|
||||
if let Some((version_lsn, value)) = iter.next().transpose()? {
|
||||
let page_img: Bytes;
|
||||
|
||||
match ObjectValue::des(&value)? {
|
||||
ObjectValue::Page(img) => {
|
||||
page_img = img;
|
||||
}
|
||||
ObjectValue::WALRecord(_rec) => {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(tag, lsn, page_img.clone())?;
|
||||
}
|
||||
x => bail!("Unexpected object value: {:?}", x),
|
||||
}
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
trace!(
|
||||
"Returning page with LSN {:X}/{:X} for {:?} from {} (request {})",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag,
|
||||
version_lsn,
|
||||
lsn
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
trace!("page {:?} at {} not found", tag, lsn);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
/// Get size of relation
|
||||
fn get_rel_size(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
let lsn = self.wait_lsn(lsn)?;
|
||||
@@ -553,50 +609,6 @@ impl Timeline for ObjectTimeline {
|
||||
}
|
||||
|
||||
impl ObjectTimeline {
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes> {
|
||||
// Look up the page entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let searchkey = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag,
|
||||
};
|
||||
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
|
||||
|
||||
if let Some((version_lsn, value)) = iter.next().transpose()? {
|
||||
let page_img: Bytes;
|
||||
|
||||
match ObjectValue::des(&value)? {
|
||||
ObjectValue::Page(img) => {
|
||||
page_img = img;
|
||||
}
|
||||
ObjectValue::WALRecord(_rec) => {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(tag, lsn, page_img.clone())?;
|
||||
}
|
||||
x => bail!("Unexpected object value: {:?}", x),
|
||||
}
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
trace!(
|
||||
"Returning page with LSN {:X}/{:X} for {:?} from {} (request {})",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag,
|
||||
version_lsn,
|
||||
lsn
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
trace!("page {:?} at {} not found", tag, lsn);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
///
|
||||
/// Internal function to get relation size at given LSN.
|
||||
///
|
||||
|
||||
@@ -38,6 +38,9 @@ pub trait Timeline: Send + Sync {
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
@@ -273,6 +276,17 @@ pub enum ObjectTag {
|
||||
RelationBuffer(BufferTag),
|
||||
}
|
||||
|
||||
impl ObjectTag {
|
||||
pub fn is_versioned(&self) -> bool {
|
||||
match self {
|
||||
ObjectTag::Checkpoint => false,
|
||||
ObjectTag::ControlFile => false,
|
||||
ObjectTag::TimelineMetadataTag => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
|
||||
@@ -254,7 +254,6 @@ fn import_slru_file(
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
@@ -291,9 +290,13 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let pg_control_bytes = timeline.get_page_at_lsn(ObjectTag::ControlFile, Lsn(0))?;
|
||||
let pg_control = decode_pg_control(pg_control_bytes)?;
|
||||
let mut checkpoint = pg_control.checkPointCopy.clone();
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, Lsn(0))?;
|
||||
let mut checkpoint = decode_checkpoint(checkpoint_bytes)?;
|
||||
if checkpoint.nextXid.value == 0 {
|
||||
let pg_control_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, Lsn(0))?;
|
||||
let pg_control = decode_pg_control(pg_control_bytes)?;
|
||||
checkpoint = pg_control.checkPointCopy;
|
||||
}
|
||||
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
@@ -375,14 +378,17 @@ pub fn save_decoded_record(
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
timeline.put_wal_record(blk.tag, rec)?;
|
||||
if blk.will_drop {
|
||||
timeline.put_unlink(blk.tag, lsn)?;
|
||||
} else {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(blk.tag, rec)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle a few special record types
|
||||
@@ -447,9 +453,9 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
blknum,
|
||||
});
|
||||
|
||||
let content = timeline.get_page_at_lsn(src_key, req_lsn)?;
|
||||
let content = timeline.get_page_at_lsn_nowait(src_key, req_lsn)?;
|
||||
|
||||
info!("copying block {:?} to {:?}", src_key, dst_key);
|
||||
debug!("copying block {:?} to {:?}", src_key, dst_key);
|
||||
|
||||
timeline.put_page_image(dst_key, lsn, content)?;
|
||||
num_blocks_copied += 1;
|
||||
@@ -462,6 +468,23 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
|
||||
num_rels_copied += 1;
|
||||
}
|
||||
// Copy relfilemap
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::FileNodeMap(db) => {
|
||||
if db.spcnode == src_tablespace_id && db.dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, req_lsn)?;
|
||||
let new_tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
});
|
||||
timeline.put_page_image(new_tag, lsn, img)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {} // do nothing
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Created database {}/{}, copied {} blocks in {} rels at {}",
|
||||
tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn
|
||||
|
||||
@@ -903,7 +903,7 @@ pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedW
|
||||
});
|
||||
blk.will_init = true;
|
||||
blocks.push(blk);
|
||||
info!("Prepare transaction {}", xlogrec.xl_xid);
|
||||
debug!("Prepare transaction {}", xlogrec.xl_xid);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
|
||||
@@ -174,7 +174,7 @@ fn walreceiver_main(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn(ObjectTag::Checkpoint, Lsn(0))?;
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, Lsn(0))?;
|
||||
let mut checkpoint = decode_checkpoint(checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
|
||||
@@ -305,7 +305,11 @@ impl PostgresRedoManagerInternal {
|
||||
let mut status = 0;
|
||||
let tag_blknum = match tag {
|
||||
ObjectTag::Clog(slru) => slru.blknum,
|
||||
_ => panic!("Not CLOG object tag"),
|
||||
ObjectTag::TwoPhase(_) => {
|
||||
assert!(info == pg_constants::XLOG_XACT_PREPARE);
|
||||
0 // not used by XLOG_XACT_PREPARE
|
||||
}
|
||||
_ => panic!("Not valid XACT object tag {:?}", tag),
|
||||
};
|
||||
if info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
|
||||
Reference in New Issue
Block a user