Implement correct truncation of FSM/VM forks at an arbitrary position (#2609)

Refs #2601

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Konstantin Knizhnik
2022-11-23 18:46:07 +02:00
committed by GitHub
parent 21ec28d9bc
commit a6e4a3c3ef
4 changed files with 101 additions and 23 deletions


@@ -163,6 +163,27 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
}
// This is a port of the function with the same name from freespace.c.
// The only difference is that it does not take a "level" parameter, because XLogRecordPageWithFreeSpace
// always calls it with level=FSM_BOTTOM_LEVEL.
pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
let mut leafno = addr;
const FSM_TREE_DEPTH: u32 = if pg_constants::SLOTS_PER_FSM_PAGE >= 1626 {
3
} else {
4
};
/* Count upper level nodes required to address the leaf page */
let mut pages: BlockNumber = 0;
for _l in 0..FSM_TREE_DEPTH {
pages += leafno + 1;
leafno /= pg_constants::SLOTS_PER_FSM_PAGE;
}
/* Turn the page count into 0-based block number */
pages - 1
}
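As a quick sanity check, here is a minimal sketch of the mapping this function computes, assuming the default 8 KB block size (SLOTS_PER_FSM_PAGE = 4069, hence FSM_TREE_DEPTH = 3); the concrete values are illustrative and not part of the diff:

    // Physical block 0 is the root page and block 1 the first mid-level page,
    // so leaf (logical) FSM pages start at physical block 2.
    assert_eq!(fsm_logical_to_physical(0), 2);
    assert_eq!(fsm_logical_to_physical(1), 3);
    // Leaf pages 0..=4068 occupy physical blocks 2..=4070; block 4071 is the
    // next mid-level page, so the next leaf lands one block further out.
    assert_eq!(fsm_logical_to_physical(4069), 4072);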
pub mod waldecoder {
use crate::{v14, v15};


@@ -197,6 +197,16 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
pub const XLP_LONG_HEADER: u16 = 0x0002;
/* From fsm_internals.h */
const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_PER_PAGE;
pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
/* From visibilitymap.c */
pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
(BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
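As a worked example of these formulas (assuming BLCKSZ = 8192 and SIZEOF_PAGE_HEADER_DATA = 24, i.e. the standard PostgreSQL page header; the arithmetic is illustrative only):

    FSM_NODES_PER_PAGE          = 8192 - 24 - 4       = 8164
    FSM_NON_LEAF_NODES_PER_PAGE = 8192 / 2 - 1        = 4095
    FSM_LEAF_NODES_PER_PAGE     = 8164 - 4095         = 4069   // = SLOTS_PER_FSM_PAGE
    VM_HEAPBLOCKS_PER_PAGE      = (8192 - 24) * 8 / 2 = 32672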
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [


@@ -24,7 +24,7 @@
use anyhow::Context;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{page_is_new, page_set_lsn};
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
@@ -612,20 +612,19 @@ impl<'a> WalIngest<'a> {
forknum: FSM_FORKNUM,
};
// FIXME: 'blkno' stored in the WAL record is the new size of the
// heap. The formula for calculating the new size of the FSM is
// pretty complicated (see FreeSpaceMapPrepareTruncateRel() in
// PostgreSQL), and we should also clear bits in the tail FSM block,
// and update the upper level FSM pages. None of that has been
// implemented. What we do instead, is always just truncate the FSM
// to zero blocks. That's bad for performance, but safe. (The FSM
// isn't needed for correctness, so we could also leave garbage in
// it. Seems more tidy to zap it away.)
if rec.blkno != 0 {
info!("Partial truncation of FSM is not supported");
let fsm_logical_page_no = rec.blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// The tail of the last remaining FSM page has to be zeroed.
// We are not precise here: instead of digging into the FSM bitmap format, we just clear the whole page.
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
if nblocks > fsm_physical_page_no {
// Only truncate if there is something to do: the FSM is larger than the truncation position.
self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
}
let num_fsm_blocks = 0;
self.put_rel_truncation(modification, rel, num_fsm_blocks)?;
}
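To make the new FSM branch concrete, a hypothetical walk-through (assuming SLOTS_PER_FSM_PAGE = 4069; the relation size of 5000 blocks is made up for illustration):

    // rec.blkno = 5000: the heap is being truncated to 5000 blocks.
    // fsm_logical_page_no  = 5000 / 4069 = 1
    // fsm_physical_page_no = fsm_logical_to_physical(1) = 3
    // 5000 % 4069 = 931 != 0, so physical page 3 (the partially surviving leaf)
    // is replaced with a zero page and fsm_physical_page_no becomes 4.
    // If the FSM fork currently has more than 4 blocks, it is truncated to 4.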
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
let rel = RelTag {
@@ -635,16 +634,18 @@ impl<'a> WalIngest<'a> {
forknum: VISIBILITYMAP_FORKNUM,
};
// FIXME: Like with the FSM above, the logic to truncate the VM
// correctly has not been implemented. Just zap it away completely,
// always. Unlike the FSM, the VM must never have bits incorrectly
// set. From a correctness point of view, it's always OK to clear
// bits or remove it altogether, though.
if rec.blkno != 0 {
info!("Partial truncation of VM is not supported");
let mut vm_page_no = rec.blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE;
if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
// The tail of the last remaining VM page has to be zeroed.
// We are not precise here: instead of digging into the VM bitmap format, we just clear the whole page.
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = self.get_relsize(rel, modification.lsn)?;
if nblocks > vm_page_no {
// Only truncate if there is something to do: the VM is larger than the truncation position.
self.put_rel_truncation(modification, rel, vm_page_no)?;
}
let num_vm_blocks = 0;
self.put_rel_truncation(modification, rel, num_vm_blocks)?;
}
Ok(())
}
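The VM branch follows the same pattern; a hypothetical walk-through (assuming VM_HEAPBLOCKS_PER_PAGE = 32672; the numbers are illustrative only):

    // rec.blkno = 5000: the heap is being truncated to 5000 blocks.
    // vm_page_no = 5000 / 32672 = 0
    // 5000 % 32672 != 0, so VM page 0 is replaced with a zero page and vm_page_no becomes 1.
    // If the VM fork currently has more than 1 block, it is truncated to 1 block.
    // Clearing the whole page also drops bits for blocks that survive the truncation,
    // which is safe: the VM may only err on the side of "not all-visible".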


@@ -0,0 +1,46 @@
import time
from fixtures.neon_fixtures import NeonEnvBuilder
#
# Test truncation of FSM and VM forks of a relation
#
def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark):
env = neon_env_builder.init_start()
n_records = 10000
n_iter = 10
# Problems with FSM/VM fork truncation are most often detected during page reconstruction triggered
# by image layer generation, so adjust the default parameters to make that happen more often.
tenant, _ = env.neon_cli.create_tenant(
conf={
"gc_period": "100 m",
"gc_horizon": "1048576",
"checkpoint_distance": "1000000",
"compaction_period": "1 s",
"compaction_threshold": "3",
"image_creation_threshold": "1",
"compaction_target_size": "1000000",
}
)
env.neon_cli.create_timeline("test_truncate", tenant_id=tenant)
pg = env.postgres.create_start("test_truncate", tenant_id=tenant)
cur = pg.connect().cursor()
cur.execute("create table t1(x integer)")
cur.execute(f"insert into t1 values (generate_series(1,{n_records}))")
cur.execute("vacuum t1")
for i in range(n_iter):
cur.execute(f"delete from t1 where x>{n_records//2}")
cur.execute("vacuum t1")
time.sleep(1) # give the pageserver a chance to create image layers
cur.execute(f"insert into t1 values (generate_series({n_records//2+1}, {n_records}))")
cur.execute("vacuum t1")
time.sleep(1) # give the pageserver a chance to create image layers
cur.execute("select count(*) from t1")
res = cur.fetchone()
assert res is not None
assert res[0] == n_records