From ac60b68d5070b8f1c5533cb076affe3a7c43f2e6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 31 May 2021 23:36:17 +0300 Subject: [PATCH] Handle VM and FSM truncation WAL records in the page server. Fixes issue #190. Original patch by Konstantin Knizhnik. --- pageserver/src/restore_local_repo.rs | 97 ++++++++++++++----- pageserver/src/waldecoder.rs | 4 +- postgres_ffi/src/pg_constants.rs | 2 + .../expected/zenith-rel-truncate.out | 26 +++++ .../sql/zenith-rel-truncate.sql | 28 ++++++ 5 files changed, 129 insertions(+), 28 deletions(-) diff --git a/pageserver/src/restore_local_repo.rs b/pageserver/src/restore_local_repo.rs index 3cfebe7e63..7f47f7899c 100644 --- a/pageserver/src/restore_local_repo.rs +++ b/pageserver/src/restore_local_repo.rs @@ -281,27 +281,12 @@ pub fn save_decoded_record( && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&decoded); - if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 { - let rel = RelTag { - spcnode: truncate.rnode.spcnode, - dbnode: truncate.rnode.dbnode, - relnode: truncate.rnode.relnode, - forknum: pg_constants::MAIN_FORKNUM, - }; - timeline.put_truncation(rel, lsn, truncate.blkno)?; - } + save_xlog_smgr_truncate(timeline, lsn, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&decoded); - save_create_database( - timeline, - lsn, - createdb.db_id, - createdb.tablespace_id, - createdb.src_db_id, - createdb.src_tablespace_id, - )?; + save_xlog_dbase_create(timeline, lsn, &createdb)?; } // Now that this record has been handled, let the repository know that @@ -311,14 +296,12 @@ pub fn save_decoded_record( } /// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record. 
-fn save_create_database( - timeline: &dyn Timeline, - lsn: Lsn, - db_id: Oid, - tablespace_id: Oid, - src_db_id: Oid, - src_tablespace_id: Oid, -) -> Result<()> { +fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> { + let db_id = rec.db_id; + let tablespace_id = rec.tablespace_id; + let src_db_id = rec.src_db_id; + let src_tablespace_id = rec.src_tablespace_id; + // Creating a database is implemented by copying the template (aka. source) database. // To copy all the relations, we need to ask for the state as of the same LSN, but we // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for @@ -328,7 +311,7 @@ fn save_create_database( let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?; - info!("creatdb: {} rels", rels.len()); + trace!("save_create_database: {} rels", rels.len()); let mut num_rels_copied = 0; let mut num_blocks_copied = 0; @@ -376,3 +359,65 @@ fn save_create_database( ); Ok(()) } + +/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record. +/// +/// This is the same logic as in PostgreSQL's smgr_redo() function. +fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> { + let spcnode = rec.rnode.spcnode; + let dbnode = rec.rnode.dbnode; + let relnode = rec.rnode.relnode; + + if (rec.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 { + let rel = RelTag { + spcnode, + dbnode, + relnode, + forknum: pg_constants::MAIN_FORKNUM, + }; + timeline.put_truncation(rel, lsn, rec.blkno)?; + } + if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { + let rel = RelTag { + spcnode, + dbnode, + relnode, + forknum: pg_constants::FSM_FORKNUM, + }; + + // FIXME: 'blkno' stored in the WAL record is the new size of the + // heap. 
The formula for calculating the new size of the FSM is + // pretty complicated (see FreeSpaceMapPrepareTruncateRel() in + // PostgreSQL), and we should also clear bits in the tail FSM block, + // and update the upper level FSM pages. None of that has been + // implemented. What we do instead is always just truncate the FSM + // to zero blocks. That's bad for performance, but safe. (The FSM + // isn't needed for correctness, so we could also leave garbage in + // it. Seems more tidy to zap it away.) + if rec.blkno != 0 { + info!("Partial truncation of FSM is not supported"); + } + let num_fsm_blocks = 0; + timeline.put_truncation(rel, lsn, num_fsm_blocks)?; + } + if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { + let rel = RelTag { + spcnode, + dbnode, + relnode, + forknum: pg_constants::VISIBILITYMAP_FORKNUM, + }; + + // FIXME: Like with the FSM above, the logic to truncate the VM + // correctly has not been implemented. Just zap it away completely, + // always. Unlike the FSM, the VM must never have bits incorrectly + // set. From a correctness point of view, it's always OK to clear + // bits or remove it altogether, though.
+ if rec.blkno != 0 { + info!("Partial truncation of VM is not supported"); + } + let num_vm_blocks = 0; + timeline.put_truncation(rel, lsn, num_vm_blocks)?; + } + Ok(()) +} diff --git a/pageserver/src/waldecoder.rs b/pageserver/src/waldecoder.rs index b726bf5a0d..28ad0221a5 100644 --- a/pageserver/src/waldecoder.rs +++ b/pageserver/src/waldecoder.rs @@ -264,7 +264,7 @@ pub struct DecodedBkpBlock { /* Information on full-page image, if any */ has_image: bool, /* has image, even for consistency checking */ pub apply_image: bool, /* has image that should be restored */ - pub will_init: bool, + pub will_init: bool, /* record doesn't need previous page version to apply */ //char *bkp_image; hole_offset: u16, hole_length: u16, @@ -850,7 +850,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord { let spcnode = buf.get_u32_le(); let dbnode = buf.get_u32_le(); let relnode = buf.get_u32_le(); - //TODO save these too + //TODO handle this too? trace!( "XLOG_XACT_ABORT relfilenode {}/{}/{}", spcnode, diff --git a/postgres_ffi/src/pg_constants.rs b/postgres_ffi/src/pg_constants.rs index 16f603acca..bff7683055 100644 --- a/postgres_ffi/src/pg_constants.rs +++ b/postgres_ffi/src/pg_constants.rs @@ -20,6 +20,8 @@ pub const ROCKSDB_SPECIAL_FORKNUM: u8 = 50; // From storage_xlog.h pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; +pub const SMGR_TRUNCATE_VM: u32 = 0x0002; +pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; // // Constants from visbilitymap.h diff --git a/test_runner/zenith_regress/expected/zenith-rel-truncate.out b/test_runner/zenith_regress/expected/zenith-rel-truncate.out index ed53357c4a..f24f39ea60 100644 --- a/test_runner/zenith_regress/expected/zenith-rel-truncate.out +++ b/test_runner/zenith_regress/expected/zenith-rel-truncate.out @@ -17,3 +17,29 @@ SELECT * FROM truncatetest; (0 rows) DROP TABLE truncatetest; +-- +-- Test that the FSM is truncated along with the table. +-- +-- Create a test table and delete and vacuum away most of the rows. 
+-- This leaves the FSM full of pages with plenty of space +create table tt(i int); +insert into tt select g from generate_series(1, 100000) g; +delete from tt where i%100 != 0 and i > 10000; +vacuum freeze tt; +-- Delete the rest of the rows, and vacuum again. This truncates the +-- heap to 0 blocks, and should also truncate the FSM. +delete from tt; +vacuum tt; +-- This can be used to look at the FSM directly, if the 'pg_freespace' contrib module +-- is installed +--SELECT blkno, avail from generate_series(1, 450) blkno, pg_freespace('tt'::regclass, blkno) AS avail; +-- Insert a row again. It should go on block #0. If the FSM was not truncated, +-- the insertion would find a higher-numbered block in the FSM and use that instead. +insert into tt values (0); +select ctid, * from tt; + ctid | i +-------+--- + (0,1) | 0 +(1 row) + +drop table tt; diff --git a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql b/test_runner/zenith_regress/sql/zenith-rel-truncate.sql index 7a35cad3ab..d13b0fb19e 100644 --- a/test_runner/zenith_regress/sql/zenith-rel-truncate.sql +++ b/test_runner/zenith_regress/sql/zenith-rel-truncate.sql @@ -16,3 +16,31 @@ VACUUM truncatetest; SELECT * FROM truncatetest; DROP TABLE truncatetest; + + +-- +-- Test that the FSM is truncated along with the table. +-- + +-- Create a test table and delete and vacuum away most of the rows. +-- This leaves the FSM full of pages with plenty of space +create table tt(i int); +insert into tt select g from generate_series(1, 100000) g; +delete from tt where i%100 != 0 and i > 10000; +vacuum freeze tt; + +-- Delete the rest of the rows, and vacuum again. This truncates the +-- heap to 0 blocks, and should also truncate the FSM. +delete from tt; +vacuum tt; + +-- This can be used to look at the FSM directly, if the 'pg_freespace' contrib module +-- is installed +--SELECT blkno, avail from generate_series(1, 450) blkno, pg_freespace('tt'::regclass, blkno) AS avail; + +-- Insert a row again. 
It should go on block #0. If the FSM was not truncated, +-- the insertion would find a higher-numbered block in the FSM and use that instead. +insert into tt values (0); +select ctid, * from tt; + +drop table tt;