From f70611c8df719a45f23abffdbc5b60a803e4f87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 14 Nov 2024 17:19:13 +0200 Subject: [PATCH] Correctly truncate VM (#9342) ## Problem https://github.com/neondatabase/neon/issues/9240 ## Summary of changes Correctly truncate VM page instead just replacing it with zero page. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- libs/pageserver_api/src/record.rs | 5 ++++ libs/postgres_ffi/src/pg_constants.rs | 7 ++++-- pageserver/src/walingest.rs | 28 +++++++++++++++++---- pageserver/src/walredo/apply_neon.rs | 28 +++++++++++++++++++++ test_runner/regress/test_vm_truncate.py | 33 +++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_vm_truncate.py diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index 5c3f3deb82..bb62b35d36 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -41,6 +41,11 @@ pub enum NeonWalRecord { file_path: String, content: Option, }, + // Truncate visibility map page + TruncateVisibilityMap { + trunc_byte: usize, + trunc_offs: usize, + }, /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. #[cfg(feature = "testing")] diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 497d011d7a..e343473d77 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -243,8 +243,11 @@ const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_P pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; /* From visibilitymap.c */ -pub const VM_HEAPBLOCKS_PER_PAGE: u32 = - (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA; +pub const VM_BITS_PER_HEAPBLOCK: usize = 2; +pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK; +pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE; /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c3ccd8a2e4..84e553f330 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -587,11 +587,29 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; - let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; - if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { - // Tail of last remaining vm page has to be zeroed. - // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, vm_page_no)?; + // last remaining block, byte, and bit + let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); + let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + / pg_constants::VM_HEAPBLOCKS_PER_BYTE; + let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE + * pg_constants::VM_BITS_PER_HEAPBLOCK; + + // Unless the new size is exactly at a visibility map page boundary, the + // tail bits in the last remaining map page, representing truncated heap + // blocks, need to be cleared. This is not only tidy, but also necessary + // because we don't get a chance to clear the bits if the heap is extended + // again. + if (trunc_byte != 0 || trunc_offs != 0) + && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) + { + modification.put_rel_wal_record( + rel, + vm_page_no, + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + }, + )?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 78601d87af..d62e325310 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -42,6 +42,34 @@ pub(crate) fn apply_in_neon( } => { anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); } + // + // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c` + // + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + } => { + // sanity check that this is modifying the correct relation + let (rel, _) = key.to_rel_block().context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "TruncateVisibilityMap record on unexpected rel {}", + rel + ); + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + map[*trunc_byte + 1..].fill(0u8); + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[*trunc_byte] &= (1 << *trunc_offs) - 1; + } NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, diff --git a/test_runner/regress/test_vm_truncate.py b/test_runner/regress/test_vm_truncate.py new file mode 100644 index 0000000000..43b4f2d8b1 --- /dev/null +++ b/test_runner/regress/test_vm_truncate.py @@ -0,0 +1,33 @@ +from fixtures.neon_fixtures import NeonEnv + + +# +# Test that VM is properly truncated +# +def test_vm_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + cur.execute( + "create table t(pk integer primary key, counter integer default 0, filler text default repeat('?', 200))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000))") + cur.execute("delete from t where pk>10") + cur.execute("vacuum t") # truncates the relation, including its VM and FSM + # get image of the first block of the VM excluding the page header. It's expected + # to still be in the buffer cache. + # ignore page header (24 bytes, 48 - it's hex representation) + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + pg_bitmap = cur.fetchall()[0][0] + # flush shared buffers + cur.execute("SELECT clear_buffer_cache()") + # now download the first block of the VM from the pageserver ... + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + ps_bitmap = cur.fetchall()[0][0] + # and check that content of bitmaps are equal, i.e. PS is producing the same VM page as Postgres + assert pg_bitmap == ps_bitmap