diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index f3dad159be..492ec9748a 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -163,6 +163,27 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) { pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); } +// This is a port of the function with the same name from freespace.c. +// The only difference is that it does not have a "level" parameter, because XLogRecordPageWithFreeSpace +// always calls it with level=FSM_BOTTOM_LEVEL. +pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber { + let mut leafno = addr; + const FSM_TREE_DEPTH: u32 = if pg_constants::SLOTS_PER_FSM_PAGE >= 1626 { + 3 + } else { + 4 + }; + + /* Count upper level nodes required to address the leaf page */ + let mut pages: BlockNumber = 0; + for _l in 0..FSM_TREE_DEPTH { + pages += leafno + 1; + leafno /= pg_constants::SLOTS_PER_FSM_PAGE; + } + /* Turn the page count into 0-based block number */ + pages - 1 +} + pub mod waldecoder { use crate::{v14, v15}; diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 6aaa739a69..09678353af 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -197,6 +197,16 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00; pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10; pub const XLP_LONG_HEADER: u16 = 0x0002; +/* From fsm_internals.h */ +const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4; +const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1; +const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_PER_PAGE; +pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; + +/* From visibilitymap.c */ +pub const VM_HEAPBLOCKS_PER_PAGE: u32 = + (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + // List of subdirectories inside pgdata. 
// Copied from src/bin/initdb/initdb.c pub const PGDATA_SUBDIRS: [&str; 22] = [ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8c81ed824b..f391cdc446 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -24,7 +24,7 @@ use anyhow::Context; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; -use postgres_ffi::{page_is_new, page_set_lsn}; +use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; @@ -612,20 +612,19 @@ impl<'a> WalIngest<'a> { forknum: FSM_FORKNUM, }; - // FIXME: 'blkno' stored in the WAL record is the new size of the - // heap. The formula for calculating the new size of the FSM is - // pretty complicated (see FreeSpaceMapPrepareTruncateRel() in - // PostgreSQL), and we should also clear bits in the tail FSM block, - // and update the upper level FSM pages. None of that has been - // implemented. What we do instead, is always just truncate the FSM - // to zero blocks. That's bad for performance, but safe. (The FSM - // isn't needed for correctness, so we could also leave garbage in - // it. Seems more tidy to zap it away.) - if rec.blkno != 0 { - info!("Partial truncation of FSM is not supported"); + let fsm_logical_page_no = rec.blkno / pg_constants::SLOTS_PER_FSM_PAGE; + let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no); + if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { + // The tail of the last remaining FSM page has to be zeroed. + // We are not precise here: instead of digging into the FSM page format, we just clear the whole page. 
+ modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + fsm_physical_page_no += 1; + } + let nblocks = self.get_relsize(rel, modification.lsn)?; + if nblocks > fsm_physical_page_no { + // Truncate only if there is something to do, i.e. the FSM fork is larger than the truncate position + self.put_rel_truncation(modification, rel, fsm_physical_page_no)?; } - let num_fsm_blocks = 0; - self.put_rel_truncation(modification, rel, num_fsm_blocks)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { let rel = RelTag { @@ -635,16 +634,18 @@ impl<'a> WalIngest<'a> { forknum: VISIBILITYMAP_FORKNUM, }; - // FIXME: Like with the FSM above, the logic to truncate the VM - // correctly has not been implemented. Just zap it away completely, - // always. Unlike the FSM, the VM must never have bits incorrectly - // set. From a correctness point of view, it's always OK to clear - // bits or remove it altogether, though. - if rec.blkno != 0 { - info!("Partial truncation of VM is not supported"); + let mut vm_page_no = rec.blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; + if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { + // The tail of the last remaining VM page has to be zeroed. + // We are not precise here: instead of digging into the VM bitmap format, we just clear the whole page. 
+ modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + vm_page_no += 1; + } + let nblocks = self.get_relsize(rel, modification.lsn)?; + if nblocks > vm_page_no { + // Truncate only if there is something to do, i.e. the VM fork is larger than the truncate position + self.put_rel_truncation(modification, rel, vm_page_no)?; } - let num_vm_blocks = 0; - self.put_rel_truncation(modification, rel, num_vm_blocks)?; } Ok(()) } diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py new file mode 100644 index 0000000000..8a45276f5a --- /dev/null +++ b/test_runner/regress/test_truncate.py @@ -0,0 +1,46 @@ +import time + +from fixtures.neon_fixtures import NeonEnvBuilder + + +# +# Test truncation of FSM and VM forks of a relation +# +def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): + + env = neon_env_builder.init_start() + n_records = 10000 + n_iter = 10 + + # Problems with FSM/VM fork truncation are most frequently detected during page reconstruction triggered + # by image layer generation, so adjust the default parameters to make it happen more frequently. 
+ tenant, _ = env.neon_cli.create_tenant( + conf={ + "gc_period": "100 m", + "gc_horizon": "1048576", + "checkpoint_distance": "1000000", + "compaction_period": "1 s", + "compaction_threshold": "3", + "image_creation_threshold": "1", + "compaction_target_size": "1000000", + } + ) + + env.neon_cli.create_timeline("test_truncate", tenant_id=tenant) + pg = env.postgres.create_start("test_truncate", tenant_id=tenant) + cur = pg.connect().cursor() + cur.execute("create table t1(x integer)") + cur.execute(f"insert into t1 values (generate_series(1,{n_records}))") + cur.execute("vacuum t1") + for i in range(n_iter): + cur.execute(f"delete from t1 where x>{n_records//2}") + cur.execute("vacuum t1") + time.sleep(1) # give the pageserver a chance to create image layers + cur.execute(f"insert into t1 values (generate_series({n_records//2+1}, {n_records}))") + cur.execute("vacuum t1") + time.sleep(1) # give the pageserver a chance to create image layers + + cur.execute("select count(*) from t1") + res = cur.fetchone() + assert res is not None + assert res[0] == n_records