Set hint bits in pageserver

This commit is contained in:
Konstantin Knizhnik
2021-09-10 18:27:34 +03:00
parent 08bc808043
commit 59ea3973a4
4 changed files with 150 additions and 1 deletions

View File

@@ -1446,6 +1446,7 @@ impl LayeredTimeline {
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn);
}
let img = self.walredo_mgr.request_redo(
self,
rel,
blknum,
request_lsn,

View File

@@ -507,6 +507,7 @@ mod tests {
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,

View File

@@ -43,7 +43,7 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::ZTenantId;
use crate::relish::*;
use crate::repository::WALRecord;
use crate::repository::{Timeline, WALRecord};
use crate::waldecoder::XlMultiXactCreate;
use crate::waldecoder::XlXactParsedRecord;
use crate::PageServerConf;
@@ -79,6 +79,7 @@ pub trait WalRedoManager: Send + Sync {
/// the records.
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -96,6 +97,7 @@ pub struct DummyRedoManager {}
impl crate::walredo::WalRedoManager for DummyRedoManager {
fn request_redo(
&self,
_timeline: &dyn Timeline,
_rel: RelishTag,
_blknum: u32,
_lsn: Lsn,
@@ -176,6 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
///
fn request_redo(
&self,
timeline: &dyn Timeline,
rel: RelishTag,
blknum: u32,
lsn: Lsn,
@@ -216,6 +219,13 @@ impl WalRedoManager for PostgresRedoManager {
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
if let Ok(page) = result {
let mut buf = BytesMut::new();
buf.extend_from_slice(&page);
self.set_hint_bits(timeline, &mut buf, lsn, &request.records);
return Ok(buf.freeze());
}
result
}
}
@@ -242,6 +252,117 @@ impl PostgresRedoManager {
}
}
/// Look up the status of transaction `xid` as of `lsn`.
///
/// The CLOG page that covers `xid` is fetched from `timeline` with the
/// non-waiting page getter, and the status is extracted from it by
/// `transaction_id_get_status`.
///
/// If the page cannot be fetched for any reason, the transaction is reported
/// as `TRANSACTION_STATUS_IN_PROGRESS`; the error itself is discarded.
fn xid_status(&self, timeline: &dyn Timeline, xid: u32, lsn: Lsn) -> u8 {
    // Global CLOG page number holding this xid, then split into
    // (SLRU segment, page within that segment).
    let clog_pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
    let clog_segment = RelishTag::Slru {
        slru: SlruKind::Clog,
        segno: clog_pageno / pg_constants::SLRU_PAGES_PER_SEGMENT,
    };
    let page_in_segment = clog_pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

    match timeline.get_page_at_lsn_nowait(clog_segment, page_in_segment, lsn) {
        Ok(clog_page) => {
            postgres_ffi::nonrelfile_utils::transaction_id_get_status(xid, &clog_page[..])
        }
        // Page not available: fall back to "in progress".
        Err(_) => pg_constants::TRANSACTION_STATUS_IN_PROGRESS,
    }
}
/// Set tuple hint bits on a reconstructed page, in place.
///
/// Works in two steps:
///
/// 1. If the page header flags carry neither `PD_HEAP_RELATION` nor
///    `PD_NONHEAP_RELATION`, classify the page: it is a heap page if any of
///    `records` belongs to the heap resource managers (`RM_HEAP_ID` /
///    `RM_HEAP2_ID`), otherwise a non-heap page. The chosen flag is written
///    back into the page header.
/// 2. For heap pages, walk the line pointer array (from the end of the page
///    header up to `pd_lower`). For every `LP_NORMAL` item whose xmin / xmax
///    hint bits are still unset, look up the transaction status at `lsn` via
///    `xid_status` and set the matching COMMITTED / INVALID bit in
///    `t_infomask`. Transactions that are neither committed nor aborted leave
///    the tuple untouched. An xmax that is a multixact is never looked up.
///
/// Parameters:
/// * `timeline` - used to fetch CLOG pages for status lookups.
/// * `page`     - page image to update in place.
/// * `lsn`      - LSN at which transaction status is evaluated.
/// * `records`  - WAL records that were applied to build `page`; only their
///   resource manager id is inspected.
///
/// Hint bits are an optimisation, so malformed input is tolerated rather than
/// panicking: a page shorter than the page header is left untouched, and line
/// pointers or tuple headers that do not fit inside the page are skipped.
///
/// NOTE(review): this function writes to the header flag bytes of whatever
/// page it is given. The visible call site in `request_redo` does not appear
/// to filter by relish kind - confirm that pages without a standard page
/// header (e.g. SLRU pages) never reach this function.
fn set_hint_bits(
    &self,
    timeline: &dyn Timeline,
    page: &mut BytesMut,
    lsn: Lsn,
    records: &[WALRecord],
) {
    // Size in bytes of one line pointer entry.
    const LINE_POINTER_SIZE: usize = 4;

    // Without a complete page header there is nothing we can safely read.
    if page.len() < pg_constants::SIZE_OF_PAGE_HEADER_DATA {
        return;
    }

    let flags_range = pg_constants::PD_FLAGS_OFFSET..pg_constants::PD_FLAGS_OFFSET + 2;
    let mut flags = LittleEndian::read_u16(&page[flags_range.clone()]);

    if (flags & (pg_constants::PD_HEAP_RELATION | pg_constants::PD_NONHEAP_RELATION)) == 0 {
        // The type of relation was not determined yet, so do it now.
        let is_heap = records.iter().any(|r| {
            let xl_rmid = r.rec[pg_constants::XL_RMID_OFFS];
            xl_rmid == pg_constants::RM_HEAP_ID || xl_rmid == pg_constants::RM_HEAP2_ID
        });
        flags |= if is_heap {
            pg_constants::PD_HEAP_RELATION
        } else {
            pg_constants::PD_NONHEAP_RELATION
        };
        LittleEndian::write_u16(&mut page[flags_range], flags);
    }

    if (flags & pg_constants::PD_HEAP_RELATION) == 0 {
        return;
    }

    // Map a transaction's status to the hint bit to set: `committed_bit`,
    // `aborted_bit`, or 0 when the outcome is not final yet.
    let hint_bit_for = |xid: u32, committed_bit: u16, aborted_bit: u16| -> u16 {
        let status = self.xid_status(timeline, xid, lsn);
        if status == pg_constants::TRANSACTION_STATUS_COMMITTED {
            committed_bit
        } else if status == pg_constants::TRANSACTION_STATUS_ABORTED {
            aborted_bit
        } else {
            0
        }
    };

    // Number of tuple header bytes we need: everything up to and including t_infomask.
    let tuple_header_prefix = pg_constants::T_INFOMASK_OFFS + 2;

    let pd_lower = LittleEndian::read_u16(
        &page[pg_constants::PD_LOWER_OFFSET..pg_constants::PD_LOWER_OFFSET + 2],
    ) as usize;

    let mut tid_offs = pg_constants::SIZE_OF_PAGE_HEADER_DATA;
    // The second condition protects against a pd_lower pointing past the end of the buffer.
    while tid_offs < pd_lower && tid_offs + LINE_POINTER_SIZE <= page.len() {
        let tid = LittleEndian::read_u32(&page[tid_offs..tid_offs + LINE_POINTER_SIZE]);
        tid_offs += LINE_POINTER_SIZE;

        // Line pointer layout as decoded here: low 15 bits are the tuple
        // offset within the page, the next 2 bits are the item state.
        let lp_off = (tid & 0x7FFF) as usize;
        let lp_flags = (tid >> 15) & 3;
        if lp_flags != pg_constants::LP_NORMAL {
            continue;
        }
        // Skip line pointers whose tuple header would run off the page.
        if lp_off + tuple_header_prefix > page.len() {
            continue;
        }

        let infomask_range =
            lp_off + pg_constants::T_INFOMASK_OFFS..lp_off + pg_constants::T_INFOMASK_OFFS + 2;
        let old_infomask = LittleEndian::read_u16(&page[infomask_range.clone()]);
        let mut t_infomask = old_infomask;

        // xmin: only look it up if neither hint bit is set yet.
        if (t_infomask & (pg_constants::HEAP_XMIN_COMMITTED | pg_constants::HEAP_XMIN_INVALID))
            == 0
        {
            let t_xmin = LittleEndian::read_u32(
                &page[lp_off + pg_constants::T_XMIN_OFFS..lp_off + pg_constants::T_XMIN_OFFS + 4],
            );
            if t_xmin != 0 {
                t_infomask |= hint_bit_for(
                    t_xmin,
                    pg_constants::HEAP_XMIN_COMMITTED,
                    pg_constants::HEAP_XMIN_INVALID,
                );
            }
        }

        // xmax: same, but a multixact xmax is not a plain xid and cannot be
        // looked up in CLOG directly, so it is left alone.
        if (t_infomask
            & (pg_constants::HEAP_XMAX_COMMITTED
                | pg_constants::HEAP_XMAX_INVALID
                | pg_constants::HEAP_XMAX_IS_MULTI))
            == 0
        {
            let t_xmax = LittleEndian::read_u32(
                &page[lp_off + pg_constants::T_XMAX_OFFS..lp_off + pg_constants::T_XMAX_OFFS + 4],
            );
            if t_xmax != 0 {
                t_infomask |= hint_bit_for(
                    t_xmax,
                    pg_constants::HEAP_XMAX_COMMITTED,
                    pg_constants::HEAP_XMAX_INVALID,
                );
            }
        }

        // Touch the page only when a hint bit was actually added.
        if t_infomask != old_infomask {
            LittleEndian::write_u16(&mut page[infomask_range], t_infomask);
        }
    }
}
///
/// Process one request for WAL redo.
///

View File

@@ -46,6 +46,7 @@ pub const SIZE_OF_PAGE_HEADER: u16 = 24;
// Number of bits kept per heap block.
pub const BITS_PER_HEAPBLOCK: u16 = 2;
// How many heap blocks' worth of bits fit into one page after its header:
// (usable bytes) * 8 bits / bits per heap block.
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
// Transaction status codes as read from a CLOG page
// (values mirror PostgreSQL's clog.h).
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
@@ -191,6 +192,31 @@ pub const XLP_LONG_HEADER: u16 = 0x0002;
pub const PG_MAJORVERSION: &str = "14";
// Zenith specific page flags used to distinguish heap and non-heap relations.
// They live in the page header flags word (see PD_FLAGS_OFFSET below).
pub const PD_HEAP_RELATION: u16 = 0x10;
pub const PD_NONHEAP_RELATION: u16 = 0x20;
// bufpage.h
// Byte offsets of fields within the page header.
pub const PD_FLAGS_OFFSET: usize = 10; // PageHeaderData.pd_flags
pub const PD_LOWER_OFFSET: usize = 12; // PageHeaderData.pd_lower
// itemid.h
// Line pointer state (lp_flags) of an item that is in use.
pub const LP_NORMAL: u32 = 1;
// htup_details.h
// Byte offsets of fields within a heap tuple header, relative to the tuple start.
pub const T_XMIN_OFFS: usize = 0;
pub const T_XMAX_OFFS: usize = 4;
// t_infomask comes after three 4-byte fields (t_xmin, t_xmax, t_field3),
// the 6-byte t_ctid (three 2-byte parts) and the 2-byte t_infomask2.
pub const T_INFOMASK_OFFS: usize = 4 * 3 + 2 * 3 + 2;
// Hint bits stored in t_infomask.
pub const HEAP_XMIN_COMMITTED: u16 = 0x0100; /* t_xmin committed */
pub const HEAP_XMIN_INVALID: u16 = 0x0200; /* t_xmin invalid/aborted */
pub const HEAP_XMAX_COMMITTED: u16 = 0x0400; /* t_xmax committed */
pub const HEAP_XMAX_INVALID: u16 = 0x0800; /* t_xmax invalid/aborted */
pub const HEAP_XMAX_IS_MULTI: u16 = 0x1000; /* t_xmax is a MultiXactId */
// Size in bytes of the fixed page header; the line pointer array starts right after it.
pub const SIZE_OF_PAGE_HEADER_DATA: usize = 24;
// xlogrecord.h
// Byte offset of the resource manager id (xl_rmid) within a WAL record header.
pub const XL_RMID_OFFS: usize = 17;
// List of subdirectories inside pgdata.
// Copied from src/bin/initdb/initdb.c
pub const PGDATA_SUBDIRS: [&str; 22] = [