Implement correct truncation of FSM/VM forks at an arbitrary position (#2609)

Refs #2601

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Konstantin Knizhnik
2022-11-23 18:46:07 +02:00
committed by GitHub
parent 21ec28d9bc
commit a6e4a3c3ef
4 changed files with 101 additions and 23 deletions


@@ -163,6 +163,27 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
}
// This is a port of the function with the same name from freespace.c.
// The only difference is that it does not take a "level" parameter, because XLogRecordPageWithFreeSpace
// always calls it with level=FSM_BOTTOM_LEVEL.
pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
let mut leafno = addr;
const FSM_TREE_DEPTH: u32 = if pg_constants::SLOTS_PER_FSM_PAGE >= 1626 {
3
} else {
4
};
/* Count upper level nodes required to address the leaf page */
let mut pages: BlockNumber = 0;
for _l in 0..FSM_TREE_DEPTH {
pages += leafno + 1;
leafno /= pg_constants::SLOTS_PER_FSM_PAGE;
}
/* Turn the page count into 0-based block number */
pages - 1
}
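As a quick sanity check, here is a minimal sketch of the mapping this function computes, assuming the default 8 KB block size (SLOTS_PER_FSM_PAGE = 4069, hence FSM_TREE_DEPTH = 3); the concrete values are illustrative and not part of the diff:

    // Physical block 0 is the root page and block 1 the first mid-level page,
    // so leaf (logical) FSM pages start at physical block 2.
    assert_eq!(fsm_logical_to_physical(0), 2);
    assert_eq!(fsm_logical_to_physical(1), 3);
    // Leaf pages 0..=4068 occupy physical blocks 2..=4070; block 4071 is the
    // next mid-level page, so the next leaf lands one block further out.
    assert_eq!(fsm_logical_to_physical(4069), 4072);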
pub mod waldecoder {
use crate::{v14, v15};


@@ -197,6 +197,16 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From fsm_internals.h */
const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_PER_PAGE;
pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
/* From visibilitymap.c */
pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
(BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
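As a worked example of these formulas (assuming BLCKSZ = 8192 and SIZEOF_PAGE_HEADER_DATA = 24, i.e. the standard PostgreSQL page header; the arithmetic is illustrative only):

    FSM_NODES_PER_PAGE          = 8192 - 24 - 4       = 8164
    FSM_NON_LEAF_NODES_PER_PAGE = 8192 / 2 - 1        = 4095
    FSM_LEAF_NODES_PER_PAGE     = 8164 - 4095         = 4069   // = SLOTS_PER_FSM_PAGE
    VM_HEAPBLOCKS_PER_PAGE      = (8192 - 24) * 8 / 2 = 32672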
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [


@@ -24,7 +24,7 @@
use anyhow::Context;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{page_is_new, page_set_lsn};
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
@@ -612,20 +612,19 @@ impl<'a> WalIngest<'a> {
forknum: FSM_FORKNUM,
};
// FIXME: 'blkno' stored in the WAL record is the new size of the
// heap. The formula for calculating the new size of the FSM is
// pretty complicated (see FreeSpaceMapPrepareTruncateRel() in
// PostgreSQL), and we should also clear bits in the tail FSM block,
// and update the upper level FSM pages. None of that has been
// implemented. What we do instead, is always just truncate the FSM
// to zero blocks. That's bad for performance, but safe. (The FSM
// isn't needed for correctness, so we could also leave garbage in
// it. Seems more tidy to zap it away.)
if rec.blkno != 0 {
info!("Partial truncation of FSM is not supported");
let fsm_logical_page_no = rec.blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// The tail of the last remaining FSM page has to be zeroed.
// We are not precise here: instead of digging into the FSM bitmap format, we just clear the whole page.
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
if nblocks > fsm_physical_page_no {
// Only truncate if there is something to do: the FSM is larger than the truncation position.
self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
}
let num_fsm_blocks = 0;
self.put_rel_truncation(modification, rel, num_fsm_blocks)?;
}
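To make the new FSM branch concrete, a hypothetical walk-through (assuming SLOTS_PER_FSM_PAGE = 4069; the relation size of 5000 blocks is made up for illustration):

    // rec.blkno = 5000: the heap is being truncated to 5000 blocks.
    // fsm_logical_page_no  = 5000 / 4069 = 1
    // fsm_physical_page_no = fsm_logical_to_physical(1) = 3
    // 5000 % 4069 = 931 != 0, so physical page 3 (the partially surviving leaf)
    // is replaced with a zero page and fsm_physical_page_no becomes 4.
    // If the FSM fork currently has more than 4 blocks, it is truncated to 4.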
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
let rel = RelTag {
@@ -635,16 +634,18 @@ impl<'a> WalIngest<'a> {
forknum: VISIBILITYMAP_FORKNUM,
};
// FIXME: Like with the FSM above, the logic to truncate the VM
// correctly has not been implemented. Just zap it away completely,
// always. Unlike the FSM, the VM must never have bits incorrectly
// set. From a correctness point of view, it's always OK to clear
// bits or remove it altogether, though.
if rec.blkno != 0 {
info!("Partial truncation of VM is not supported");
let mut vm_page_no = rec.blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE;
if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
// The tail of the last remaining VM page has to be zeroed.
// We are not precise here: instead of digging into the VM bitmap format, we just clear the whole page.
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
if nblocks > vm_page_no {
// Only truncate if there is something to do: the VM is larger than the truncation position.
self.put_rel_truncation(modification, rel, vm_page_no)?;
}
let num_vm_blocks = 0;
self.put_rel_truncation(modification, rel, num_vm_blocks)?;
}
Ok(())
}
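The VM branch follows the same pattern; a hypothetical walk-through (assuming VM_HEAPBLOCKS_PER_PAGE = 32672; the numbers are illustrative only):

    // rec.blkno = 5000: the heap is being truncated to 5000 blocks.
    // vm_page_no = 5000 / 32672 = 0
    // 5000 % 32672 != 0, so VM page 0 is replaced with a zero page and vm_page_no becomes 1.
    // If the VM fork currently has more than 1 block, it is truncated to 1 block.
    // Clearing the whole page also drops bits for blocks that survive the truncation,
    // which is safe: the VM may only err on the side of "not all-visible".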


@@ -0,0 +1,46 @@
import time
from fixtures.neon_fixtures import NeonEnvBuilder
#
# Test truncation of FSM and VM forks of a relation
#
def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark):
env = neon_env_builder.init_start()
n_records = 10000
n_iter = 10
# Problems with FSM/VM fork truncation are most often detected during page reconstruction triggered
# by image layer generation, so adjust the default parameters to make that happen more often.
tenant, _ = env.neon_cli.create_tenant(
conf={
"gc_period": "100 m",
"gc_horizon": "1048576",
"checkpoint_distance": "1000000",
"compaction_period": "1 s",
"compaction_threshold": "3",
"image_creation_threshold": "1",
"compaction_target_size": "1000000",
}
)
env.neon_cli.create_timeline("test_truncate", tenant_id=tenant)
pg = env.postgres.create_start("test_truncate", tenant_id=tenant)
cur = pg.connect().cursor()
cur.execute("create table t1(x integer)")
cur.execute(f"insert into t1 values (generate_series(1,{n_records}))")
cur.execute("vacuum t1")
for i in range(n_iter):
cur.execute(f"delete from t1 where x>{n_records//2}")
cur.execute("vacuum t1")
time.sleep(1) # give the pageserver a chance to create image layers
cur.execute(f"insert into t1 values (generate_series({n_records//2+1}, {n_records}))")
cur.execute("vacuum t1")
time.sleep(1) # give the pageserver a chance to create image layers
cur.execute("select count(*) from t1")
res = cur.fetchone()
assert res is not None
assert res[0] == n_records