Avoid having multiple records for the same page and LSN.

If a heap UPDATE record modified two pages, and both pages needed to have
their VM bits cleared, and the VM bits were located on the same VM page,
we would emit two ZenithWalRecord::ClearVisibilityMapFlags records for
the same VM page. That produced warnings like this in the pageserver log:

    Page version Wal(ClearVisibilityMapFlags { heap_blkno: 18, flags: 3 }) of rel 1663/13949/2619_vm blk 0 at 2A/346046A0 already exists

To fix, change ClearVisibilityMapFlags so that it can update the bits
for both pages as one operation.

This was already covered by several Python tests, so no need to add a
new one. Fixes #1125.

Co-authored-by: Konstantin Knizhnik <knizhnik@zenith.tech>
This commit is contained in:
Heikki Linnakangas
2022-02-15 14:26:16 +02:00
parent 328e3b4189
commit 9632c352ab
3 changed files with 105 additions and 95 deletions

View File

@@ -306,8 +306,12 @@ pub enum ZenithWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Set bits in heap visibility map. (heap blkno, flag bits to clear)
ClearVisibilityMapFlags { heap_blkno: u32, flags: u8 },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted { xids: Vec<TransactionId> },
/// Mark transaction IDs as aborted on a CLOG page

View File

@@ -349,49 +349,25 @@ impl WalIngest {
decoded: &mut DecodedWALRecord,
) -> Result<()> {
// Handle VM bit updates that are implicitly part of heap records.
// First, look at the record to determine which VM bits need
// to be cleared. If either of these variables is set, we
// need to clear the corresponding bits in the visibility map.
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
if info == pg_constants::XLOG_HEAP_INSERT {
let xlrec = XlHeapInsert::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
timeline.put_wal_record(
lsn,
RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[0].rnode_spcnode,
dbnode: decoded.blocks[0].rnode_dbnode,
relnode: decoded.blocks[0].rnode_relnode,
}),
decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
ZenithWalRecord::ClearVisibilityMapFlags {
heap_blkno: decoded.blocks[0].blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = XlHeapDelete::decode(buf);
assert_eq!(0, buf.remaining());
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
timeline.put_wal_record(
lsn,
RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[0].rnode_spcnode,
dbnode: decoded.blocks[0].rnode_dbnode,
relnode: decoded.blocks[0].rnode_relnode,
}),
decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
ZenithWalRecord::ClearVisibilityMapFlags {
heap_blkno: decoded.blocks[0].blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_UPDATE
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
@@ -400,39 +376,15 @@ impl WalIngest {
// the size of tuple data is inferred from the size of the record.
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
timeline.put_wal_record(
lsn,
RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[0].rnode_spcnode,
dbnode: decoded.blocks[0].rnode_dbnode,
relnode: decoded.blocks[0].rnode_relnode,
}),
decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
ZenithWalRecord::ClearVisibilityMapFlags {
heap_blkno: decoded.blocks[0].blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
&& decoded.blocks.len() > 1
{
timeline.put_wal_record(
lsn,
RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[1].rnode_spcnode,
dbnode: decoded.blocks[1].rnode_dbnode,
relnode: decoded.blocks[1].rnode_relnode,
}),
decoded.blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
ZenithWalRecord::ClearVisibilityMapFlags {
heap_blkno: decoded.blocks[1].blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -448,23 +400,60 @@ impl WalIngest {
};
assert_eq!(offset_array_len, buf.remaining());
// FIXME: why also ALL_FROZEN_SET?
if (xlrec.flags
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
}
}
// FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
// Clear the VM bits if required.
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
let vm_relish = RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[0].rnode_spcnode,
dbnode: decoded.blocks[0].rnode_dbnode,
relnode: decoded.blocks[0].rnode_relnode,
});
let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
if new_vm_blk == old_vm_blk {
// An UPDATE record that needs to clear the bits for both old and the
// new page, both of which reside on the same VM page.
timeline.put_wal_record(
lsn,
vm_relish,
new_vm_blk.unwrap(),
ZenithWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
} else {
// Clear VM bits for one heap page, or for two pages that reside on
// different VM pages.
if let Some(new_vm_blk) = new_vm_blk {
timeline.put_wal_record(
lsn,
RelishTag::Relation(RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: decoded.blocks[0].rnode_spcnode,
dbnode: decoded.blocks[0].rnode_dbnode,
relnode: decoded.blocks[0].rnode_relnode,
}),
decoded.blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
vm_relish,
new_vm_blk,
ZenithWalRecord::ClearVisibilityMapFlags {
heap_blkno: decoded.blocks[0].blkno,
new_heap_blkno,
old_heap_blkno: None,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
}
if let Some(old_vm_blk) = old_vm_blk {
timeline.put_wal_record(
lsn,
vm_relish,
old_vm_blk,
ZenithWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
)?;
@@ -472,8 +461,6 @@ impl WalIngest {
}
}
// FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
Ok(())
}

View File

@@ -363,25 +363,44 @@ impl PostgresRedoManager {
will_init: _,
rec: _,
} => panic!("tried to pass postgres wal record to zenith WAL redo"),
ZenithWalRecord::ClearVisibilityMapFlags { heap_blkno, flags } => {
// Calculate the VM block and offset that corresponds to the heap block.
let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(*heap_blkno);
let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(*heap_blkno);
let map_offset = pg_constants::HEAPBLK_TO_OFFSET(*heap_blkno);
// Check that we're modifying the correct VM block.
ZenithWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
} => {
// sanity check that this is modifying the correct relish
assert!(
check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
"ClearVisibilityMapFlags record on unexpected rel {:?}",
rel
);
assert!(map_block == blknum);
if let Some(heap_blkno) = *new_heap_blkno {
// Calculate the VM block and offset that corresponds to the heap block.
let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
// equivalent to PageGetContents(page)
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
// Check that we're modifying the correct VM block.
assert!(map_block == blknum);
let mask: u8 = flags << map_offset;
map[map_byte as usize] &= !mask;
// equivalent to PageGetContents(page)
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
map[map_byte as usize] &= !(flags << map_offset);
}
// Repeat for 'old_heap_blkno', if any
if let Some(heap_blkno) = *old_heap_blkno {
let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);
assert!(map_block == blknum);
let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];
map[map_byte as usize] &= !(flags << map_offset);
}
}
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.