From 59ea3973a4de399cb4edc6a5fba32cb9e9161825 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik
Date: Fri, 10 Sep 2021 18:27:34 +0300
Subject: [PATCH] Set hint bits in pageserver

After WAL redo of a page, set the xmin/xmax hint bits of its heap tuples
from the CLOG as of the requested LSN. Pages are tagged as heap or
non-heap in pd_flags the first time they are reconstructed.
WalRedoManager::request_redo now takes the timeline so that CLOG pages
can be looked up. SLRU pages are returned untouched.
---
 pageserver/src/layered_repository.rs |   1 +
 pageserver/src/repository.rs         |   1 +
 pageserver/src/walredo.rs            | 138 ++++++++++++++++++++++++++-
 postgres_ffi/src/pg_constants.rs     |  26 ++++++
 4 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index e9db64fdfb..67ae3bc4f3 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -1446,6 +1446,7 @@ impl LayeredTimeline {
                     trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
                 }
                 let img = self.walredo_mgr.request_redo(
+                    self,
                     rel,
                     blknum,
                     request_lsn,
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 293cf03550..8003350813 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -507,6 +507,7 @@ mod tests {
     impl WalRedoManager for TestRedoManager {
         fn request_redo(
             &self,
+            timeline: &dyn Timeline,
             rel: RelishTag,
             blknum: u32,
             lsn: Lsn,
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index f1399c4eec..6020e94bd2 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
 use zenith_utils::zid::ZTenantId;
 
 use crate::relish::*;
-use crate::repository::WALRecord;
+use crate::repository::{Timeline, WALRecord};
 use crate::waldecoder::XlMultiXactCreate;
 use crate::waldecoder::XlXactParsedRecord;
 use crate::PageServerConf;
@@ -79,6 +79,7 @@ pub trait WalRedoManager: Send + Sync {
     /// the reords.
     fn request_redo(
         &self,
+        timeline: &dyn Timeline,
         rel: RelishTag,
         blknum: u32,
         lsn: Lsn,
@@ -96,6 +97,7 @@ pub struct DummyRedoManager {}
 impl crate::walredo::WalRedoManager for DummyRedoManager {
     fn request_redo(
         &self,
+        _timeline: &dyn Timeline,
         _rel: RelishTag,
         _blknum: u32,
         _lsn: Lsn,
@@ -176,6 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
     ///
     fn request_redo(
         &self,
+        timeline: &dyn Timeline,
         rel: RelishTag,
         blknum: u32,
         lsn: Lsn,
@@ -216,6 +219,18 @@ impl WalRedoManager for PostgresRedoManager {
         WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
         WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
 
+        // Hint bits only apply to pages that start with a standard page header.
+        // SLRU pages (such as the CLOG pages that xid_status reads back through
+        // the timeline) are raw data and must be returned untouched.
+        // NOTE(review): assumes `RelishTag` is `Copy`, so `rel` is still usable here.
+        if !matches!(rel, RelishTag::Slru { .. }) {
+            if let Ok(page) = result {
+                let mut buf = BytesMut::from(&page[..]);
+                self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
+                return Ok(buf.freeze());
+            }
+        }
+
         result
     }
 }
@@ -242,6 +257,127 @@ impl PostgresRedoManager {
         }
     }
 
+    /// Look up the status of transaction `xid` in the CLOG as of `lsn`.
+    ///
+    /// The CLOG page is fetched from `timeline` with `get_page_at_lsn_nowait`.
+    /// If it cannot be read, the transaction is reported as in progress so
+    /// that the caller leaves the corresponding hint bits unset.
+    fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
+        let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
+        let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+        let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+        if let Ok(clog_page) = timeline.get_page_at_lsn_nowait(
+            RelishTag::Slru {
+                slru: SlruKind::Clog,
+                segno,
+            },
+            rpageno,
+            lsn,
+        ) {
+            postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
+        } else {
+            pg_constants::TRANSACTION_STATUS_IN_PROGRESS
+        }
+    }
+
+    /// Set tuple hint bits on a reconstructed page, in place.
+    ///
+    /// A page that carries neither `PD_HEAP_RELATION` nor `PD_NONHEAP_RELATION`
+    /// is classified first: it is a heap page if any of `records` belongs to
+    /// the heap resource managers, and the verdict is stored in `pd_flags`.
+    /// On heap pages, every `LP_NORMAL` item whose xmin/xmax hint bits are
+    /// still clear gets them set when the CLOG, as of `lsn`, reports the
+    /// transaction as committed or aborted.
+    ///
+    /// Offsets that point outside `page` or a WAL record are skipped instead
+    /// of indexed, so malformed input is left alone rather than causing a panic.
+    fn set_hint_bits(
+        &self,
+        timeline: &dyn Timeline,
+        page: &mut BytesMut,
+        lsn: Lsn,
+        records: &[WALRecord],
+    ) {
+        if page.len() < pg_constants::SIZE_OF_PAGE_HEADER_DATA {
+            // Too short to hold a page header.
+            return;
+        }
+        let flags_range = pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2;
+        let lower_range = pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2;
+
+        let mut flags = LittleEndian::read_u16(&page[flags_range.clone()]);
+        if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
+            // The type of the relation was not determined yet, do it now.
+            let is_heap = records.iter().any(|r| {
+                r.rec.get(pg_constants::XL_RMID_OFFS).map_or(false, |&rmid| {
+                    rmid == pg_constants::RM_HEAP_ID || rmid == pg_constants::RM_HEAP2_ID
+                })
+            });
+            flags |= if is_heap {
+                pg_constants::PD_HEAP_RELATION
+            } else {
+                pg_constants::PD_NONHEAP_RELATION
+            };
+            LittleEndian::write_u16(&mut page[flags_range], flags);
+        }
+        if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
+            return;
+        }
+
+        // Walk the line pointer array between the page header and pd_lower.
+        let pd_lower = (LittleEndian::read_u16(&page[lower_range]) as usize).min(page.len());
+        let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
+        while tid_offs + 4 <= pd_lower {
+            let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + 4]);
+            tid_offs += 4;
+            if ((tid >> 15) & 3) != pg_constants::LP_NORMAL {
+                continue;
+            }
+            // Normal item pointer: the low 15 bits hold the tuple offset.
+            let lp_off = (tid & 0x7FFF) as usize;
+            let infomask_offs = lp_off + pg_constants::T_INFOMASK_OFFS;
+            if lp_off < pg_constants::SIZE_OF_PAGE_HEADER_DATA || infomask_offs + 2 > page.len() {
+                // The tuple header would not lie inside the page body.
+                continue;
+            }
+            let t_xmin = LittleEndian::read_u32(&page[lp_off + pg_constants::T_XMIN_OFFS..]);
+            let t_xmax = LittleEndian::read_u32(&page[lp_off + pg_constants::T_XMAX_OFFS..]);
+            let old_infomask = LittleEndian::read_u16(&page[infomask_offs..]);
+            let mut t_infomask = old_infomask;
+
+            let xmin_hints = pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID;
+            if (t_infomask & xmin_hints) == 0 && t_xmin != 0 {
+                let status = self.xid_status(timeline, t_xmin, lsn);
+                if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
+                    t_infomask |= pg_constants::HEAP_XMIN_COMMITTED;
+                } else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
+                    t_infomask |= pg_constants::HEAP_XMIN_INVALID;
+                }
+            }
+
+            // A MultiXactId in t_xmax is not a transaction id; skip the CLOG lookup.
+            let xmax_hints = pg_constants::HEAP_XMAX_COMMITTED
+                | pg_constants::HEAP_XMAX_INVALID
+                | pg_constants::HEAP_XMAX_IS_MULTI;
+            if (t_infomask & xmax_hints) == 0 && t_xmax != 0 {
+                let status = self.xid_status(timeline, t_xmax, lsn);
+                if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
+                    t_infomask |= pg_constants::HEAP_XMAX_COMMITTED;
+                } else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
+                    t_infomask |= pg_constants::HEAP_XMAX_INVALID;
+                }
+            }
+
+            // Write the tuple header back only if a hint bit was actually added.
+            if t_infomask != old_infomask {
+                LittleEndian::write_u16(
+                    &mut page[infomask_offs..infomask_offs + 2],
+                    t_infomask,
+                );
+            }
+        }
+    }
+
     ///
     /// Process one request for WAL redo.
     ///
diff --git a/postgres_ffi/src/pg_constants.rs b/postgres_ffi/src/pg_constants.rs
index 5558b280f0..b80222c738 100644
--- a/postgres_ffi/src/pg_constants.rs
+++ b/postgres_ffi/src/pg_constants.rs
@@ -46,6 +46,7 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
 pub const BITS_PER_HEAPBLOCK: u16 = 2;
 pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
 
+pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
 pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
 pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
 pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
@@ -191,6 +192,31 @@ pub const XLP_LONG_HEADER: u16 = 0x0002;
 
 pub const PG_MAJORVERSION: &str = "14";
 
+// Zenith specific page flags used to distinguish heap and non-heap relations
+pub const PD_HEAP_RELATION: u16 = 0x10;
+pub const PD_NONHEAP_RELATION: u16 = 0x20;
+
+// bufpage.h
+pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
+pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
+
+// itemid.h
+pub const LP_NORMAL: u32 = 1;
+
+// htup_details.h
+pub const T_XMIN_OFFS: usize = 0;
+pub const T_XMAX_OFFS: usize = 4;
+pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
+pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
+pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
+pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
+pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
+pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
+pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
+
+// xlogrecord.h
+pub const XL_RMID_OFFS: usize = 17;
+
 // List of subdirectories inside pgdata.
 // Copied from src/bin/initdb/initdb.c
 pub const PGDATA_SUBDIRS: [&str; 22] = [