Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-09 14:32:57 +00:00)
Implement correct truncation of FSM/VM forks on arbitrary position (#2609)
refs #2601
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
committed by GitHub · parent 21ec28d9bc · commit a6e4a3c3ef
@@ -163,6 +163,27 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
     pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
 }
 
+// This is port of function with the same name from freespace.c.
+// The only difference is that it does not have "level" parameter because XLogRecordPageWithFreeSpace
+// always call it with level=FSM_BOTTOM_LEVEL
+pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
+    let mut leafno = addr;
+    const FSM_TREE_DEPTH: u32 = if pg_constants::SLOTS_PER_FSM_PAGE >= 1626 {
+        3
+    } else {
+        4
+    };
+
+    /* Count upper level nodes required to address the leaf page */
+    let mut pages: BlockNumber = 0;
+    for _l in 0..FSM_TREE_DEPTH {
+        pages += leafno + 1;
+        leafno /= pg_constants::SLOTS_PER_FSM_PAGE;
+    }
+    /* Turn the page count into 0-based block number */
+    pages - 1
+}
+
 pub mod waldecoder {
 
     use crate::{v14, v15};
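As a quick sanity check of the new helper, the following minimal sketch (not part of this diff) shows what fsm_logical_to_physical returns, assuming a standard 8 KB BLCKSZ build where SLOTS_PER_FSM_PAGE works out to 4069 and FSM_TREE_DEPTH is therefore 3; the fn main() harness and the concrete values are illustrative only, and the import mirrors the one added to walingest.rs below.

// Illustrative sketch only, not part of the commit. Assumes an 8 KB BLCKSZ
// build (SLOTS_PER_FSM_PAGE = 4069, so FSM_TREE_DEPTH = 3).
use postgres_ffi::fsm_logical_to_physical;

fn main() {
    // With depth 3, block 0 holds the root level and block 1 the first
    // intermediate page, so leaf (logical) FSM pages start at physical block 2.
    assert_eq!(fsm_logical_to_physical(0), 2);
    assert_eq!(fsm_logical_to_physical(1), 3);
    // leafno = 2: pages = (2 + 1) + (0 + 1) + (0 + 1) = 5, minus 1 => block 4.
    assert_eq!(fsm_logical_to_physical(2), 4);
}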
@@ -197,6 +197,16 @@ pub const XLOG_CHECKPOINT_SHUTDOWN: u8 = 0x00;
 pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
 pub const XLP_LONG_HEADER: u16 = 0x0002;
 
+/* From fsm_internals.h */
+const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
+const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
+const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_PER_PAGE;
+pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32;
+
+/* From visibilitymap.c */
+pub const VM_HEAPBLOCKS_PER_PAGE: u32 =
+    (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [
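For the default 8 KB BLCKSZ, and assuming SIZEOF_PAGE_HEADER_DATA = 24 (the size of PageHeaderData), these constants work out as: FSM_NODES_PER_PAGE = 8192 - 24 - 4 = 8164, FSM_NON_LEAF_NODES_PER_PAGE = 8192 / 2 - 1 = 4095, so SLOTS_PER_FSM_PAGE = 8164 - 4095 = 4069 heap blocks tracked per FSM page, and VM_HEAPBLOCKS_PER_PAGE = (8192 - 24) * 4 = 32672 heap blocks per visibility map page (two VM bits per heap block).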
@@ -24,7 +24,7 @@
 use anyhow::Context;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
-use postgres_ffi::{page_is_new, page_set_lsn};
+use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 
 use anyhow::Result;
 use bytes::{Buf, Bytes, BytesMut};
@@ -612,20 +612,19 @@ impl<'a> WalIngest<'a> {
                 forknum: FSM_FORKNUM,
             };
 
-            // FIXME: 'blkno' stored in the WAL record is the new size of the
-            // heap. The formula for calculating the new size of the FSM is
-            // pretty complicated (see FreeSpaceMapPrepareTruncateRel() in
-            // PostgreSQL), and we should also clear bits in the tail FSM block,
-            // and update the upper level FSM pages. None of that has been
-            // implemented. What we do instead, is always just truncate the FSM
-            // to zero blocks. That's bad for performance, but safe. (The FSM
-            // isn't needed for correctness, so we could also leave garbage in
-            // it. Seems more tidy to zap it away.)
-            if rec.blkno != 0 {
-                info!("Partial truncation of FSM is not supported");
+            let fsm_logical_page_no = rec.blkno / pg_constants::SLOTS_PER_FSM_PAGE;
+            let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
+            if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
+                // Tail of last remaining FSM page has to be zeroed.
+                // We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
+                modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
+                fsm_physical_page_no += 1;
             }
-            let num_fsm_blocks = 0;
-            self.put_rel_truncation(modification, rel, num_fsm_blocks)?;
+            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            if nblocks > fsm_physical_page_no {
+                // check if something to do: FSM is larger than truncate position
+                self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
+            }
         }
         if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
             let rel = RelTag {
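Worked example (illustrative, using the 8 KB defaults assumed above, not from the commit itself): if the heap is truncated to rec.blkno = 10000 blocks, then fsm_logical_page_no = 10000 / 4069 = 2 and fsm_logical_to_physical(2) = 4; since 10000 % 4069 != 0, the tail FSM page at physical block 4 is zeroed and fsm_physical_page_no becomes 5, so an FSM fork longer than 5 blocks is truncated down to 5 blocks instead of being dropped to zero blocks as before.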
@@ -635,16 +634,18 @@ impl<'a> WalIngest<'a> {
                 forknum: VISIBILITYMAP_FORKNUM,
             };
 
-            // FIXME: Like with the FSM above, the logic to truncate the VM
-            // correctly has not been implemented. Just zap it away completely,
-            // always. Unlike the FSM, the VM must never have bits incorrectly
-            // set. From a correctness point of view, it's always OK to clear
-            // bits or remove it altogether, though.
-            if rec.blkno != 0 {
-                info!("Partial truncation of VM is not supported");
+            let mut vm_page_no = rec.blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE;
+            if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 {
+                // Tail of last remaining vm page has to be zeroed.
+                // We are not precise here and instead of digging in VM bitmap format just clear the whole page.
+                modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
+                vm_page_no += 1;
             }
-            let num_vm_blocks = 0;
-            self.put_rel_truncation(modification, rel, num_vm_blocks)?;
+            let nblocks = self.get_relsize(rel, modification.lsn)?;
+            if nblocks > vm_page_no {
+                // check if something to do: VM is larger than truncate position
+                self.put_rel_truncation(modification, rel, vm_page_no)?;
+            }
         }
         Ok(())
     }
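The VM branch follows the same pattern (again assuming 8 KB pages, VM_HEAPBLOCKS_PER_PAGE = 32672): truncating the heap to 10000 blocks gives vm_page_no = 10000 / 32672 = 0; because 10000 % 32672 != 0, VM page 0 is cleared and vm_page_no becomes 1, so the visibility map keeps its first (zeroed) page and only blocks past it are truncated. Clearing the whole page rather than resetting individual bits is safe because, as the removed comment notes, a zeroed VM page never claims a heap page is all-visible.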
test_runner/regress/test_truncate.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+#
+# Test truncation of FSM and VM forks of a relation
+#
+def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark):
+
+    env = neon_env_builder.init_start()
+    n_records = 10000
+    n_iter = 10
+
+    # Problems with FSM/VM forks truncation are most frequently detected during page reconstruction triggered
+    # by image layer generation. So adjust default parameters to make it happen more frequently.
+    tenant, _ = env.neon_cli.create_tenant(
+        conf={
+            "gc_period": "100 m",
+            "gc_horizon": "1048576",
+            "checkpoint_distance": "1000000",
+            "compaction_period": "1 s",
+            "compaction_threshold": "3",
+            "image_creation_threshold": "1",
+            "compaction_target_size": "1000000",
+        }
+    )
+
+    env.neon_cli.create_timeline("test_truncate", tenant_id=tenant)
+    pg = env.postgres.create_start("test_truncate", tenant_id=tenant)
+    cur = pg.connect().cursor()
+    cur.execute("create table t1(x integer)")
+    cur.execute(f"insert into t1 values (generate_series(1,{n_records}))")
+    cur.execute("vacuum t1")
+    for i in range(n_iter):
+        cur.execute(f"delete from t1 where x>{n_records//2}")
+        cur.execute("vacuum t1")
+        time.sleep(1)  # let pageserver a chance to create image layers
+        cur.execute(f"insert into t1 values (generate_series({n_records//2+1}, {n_records}))")
+        cur.execute("vacuum t1")
+        time.sleep(1)  # let pageserver a chance to create image layers
+
+    cur.execute("select count(*) from t1")
+    res = cur.fetchone()
+    assert res is not None
+    assert res[0] == n_records
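The new regression test can presumably be run on its own through the repository's pytest-based test runner, e.g. pytest test_runner/regress/test_truncate.py (the exact invocation depends on the local Neon test environment being built and configured). It deliberately lowers checkpoint_distance and image_creation_threshold so that image-layer creation, and therefore page reconstruction of the truncated FSM/VM forks, happens within the one-second sleeps.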