From f72d4814b1b5e8b972f33bcd634ea402d4fa8cee Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 30 Nov 2021 12:57:26 +0300 Subject: [PATCH] Extract page images from FPI WAL records (#949) * Extract page images from FPI WAL records * Fix issues reported in review --- pageserver/src/restore_local_repo.rs | 45 +++++++++++++++++++++++----- pageserver/src/waldecoder.rs | 24 +++++++++++---- vendor/postgres | 2 +- 3 files changed, 56 insertions(+), 15 deletions(-) diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index 8afa2676e2..92429a20ad 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -11,7 +11,7 @@ use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; use anyhow::{anyhow, bail, Result}; -use bytes::{Buf, Bytes}; +use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::relish::*; @@ -416,7 +416,6 @@ pub fn save_decoded_record( if checkpoint.update_next_xid(decoded.xl_xid) { *checkpoint_modified = true; } - // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { @@ -427,13 +426,43 @@ pub fn save_decoded_record( forknum: blk.forknum as u8, }); - let rec = WALRecord { - will_init: blk.will_init || blk.apply_image, - rec: recdata.clone(), - main_data_offset: decoded.main_data_offset as u32, - }; + // + // Instead of storing the full-page-image WAL record, + // it is better to store the extracted image: we can skip wal-redo + // in this case. Also, some FPI records may contain multiple (up to 32) pages, + // so they would have to be copied multiple times. 
+ // + if blk.apply_image + && blk.has_image + && decoded.xl_rmid == pg_constants::RM_XLOG_ID + && (decoded.xl_info == pg_constants::XLOG_FPI + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + // compression of WAL is not yet supported: fall back to storing the original WAL record + && (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0 + { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize); + image.extend_from_slice(&recdata[img_offs..img_offs + img_len]); - timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); + image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); + assert_eq!(image.len(), pg_constants::BLCKSZ as usize); + timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?; + } else { + let rec = WALRecord { + will_init: blk.will_init || blk.apply_image, + rec: recdata.clone(), + main_data_offset: decoded.main_data_offset as u32, + }; + timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; + } } let mut buf = decoded.record.clone(); diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index b1e8e3b54f..20e04bcad9 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -229,17 +229,18 @@ pub struct DecodedBkpBlock { pub blkno: u32, /* copy of the fork_flags field from the XLogRecordBlockHeader */ - flags: u8, + pub flags: u8, /* Information on full-page image, if any */ - has_image: bool, /* has image, even for consistency checking */ + pub has_image: bool, /* has image, even for consistency checking */ pub apply_image: bool, /* has image that should be restored */ pub will_init: bool, /* record doesn't need 
previous page version to apply */ //char *bkp_image; - hole_offset: u16, - hole_length: u16, - bimg_len: u16, - bimg_info: u8, + pub hole_offset: u16, + pub hole_length: u16, + pub bimg_offset: u32, + pub bimg_len: u16, + pub bimg_info: u8, /* Buffer holding the rmgr-specific data associated with this block */ has_data: bool, @@ -859,8 +860,19 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { } // 3. Decode blocks. + let mut ptr = record.len() - buf.remaining(); + for blk in blocks.iter_mut() { + if blk.has_image { + blk.bimg_offset = ptr as u32; + ptr += blk.bimg_len as usize; + } + if blk.has_data { + ptr += blk.data_len as usize; + } + } // We don't need them, so just skip blocks_total_len bytes buf.advance(blocks_total_len as usize); + assert_eq!(ptr, record.len() - buf.remaining()); let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; diff --git a/vendor/postgres b/vendor/postgres index 9f26cc64b2..08878b19d3 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 9f26cc64b2e99ff9f5d73ab901541f4ed3cda955 +Subproject commit 08878b19d3cae5a1bd765bf7396187b6b806c6ac