diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 6a3679292e..852670af2c 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,9 +1,11 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; -use std::fmt; +use std::{fmt, ops::Range}; -use crate::reltag::{BlockNumber, RelTag}; +use crate::reltag::{BlockNumber, RelTag, SlruKind}; /// Key used in the Repository kv-store. /// @@ -143,12 +145,390 @@ impl Key { } } +// Layout of the Key address space +// +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. +// +// Principles for the mapping: +// +// - Things that are often accessed or modified together, should be close to +// each other in the key space. For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. +// +// The key space is divided into four major sections, identified by the first +// byte, and the form a hierarchy: +// +// 00 Relation data and metadata +// +// DbDir () -> (dbnode, spcnode) +// Filenodemap +// RelDir -> relnode forknum +// RelBlocks +// RelSize +// +// 01 SLRUs +// +// SlruDir kind +// SlruSegBlocks segno +// SlruSegSize +// +// 02 pg_twophase +// +// 03 misc +// Controlfile +// checkpoint +// pg_version +// +// 04 aux files +// +// Below is a full list of the keyspace allocation: +// +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00 FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 +// +// AuxFiles: +// 03 00000000 00000000 00000000 00 00000002 +// + +//-- Section 01: relation data and metadata + +pub const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +#[inline(always)] +pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +#[inline(always)] +pub fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +//-- Section 02: SLRUs + +#[inline(always)] +pub fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +#[inline(always)] +pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +#[inline(always)] +pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffffffff, + } +} + +#[inline(always)] +pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: 1, + field4: segno, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: 1, + field4: segno, + field5: 1, + field6: 0, + } +} + +//-- Section 03: pg_twophase + +pub const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +#[inline(always)] +pub fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +#[inline(always)] +pub fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: u8::from(overflowed), + field6: next_xid, + } +} + +//-- Section 03: Control file +pub const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +pub const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +pub const AUX_FILES_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 2, +}; + +// Reverse mappings for a few Keys. +// These are needed by WAL redo manager. + +// AUX_FILES currently stores only data for logical replication (slots etc), and +// we don't preserve these on a branch because safekeepers can't follow timeline +// switch (and generally it likely should be optional), so ignore these. +#[inline(always)] +pub fn is_inherited_key(key: Key) -> bool { + key != AUX_FILES_KEY +} + +#[inline(always)] +pub fn is_rel_fsm_block_key(key: Key) -> bool { + key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff +} + +#[inline(always)] +pub fn is_rel_vm_block_key(key: Key) -> bool { + key.field1 == 0x00 + && key.field4 != 0 + && key.field5 == VISIBILITYMAP_FORKNUM + && key.field6 != 0xffffffff +} + +#[inline(always)] +pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { + Ok(match key.field1 { + 0x01 => { + let kind = match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), + }; + let segno = key.field4; + let blknum = key.field6; + + (kind, segno, blknum) + } + _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +#[inline(always)] +pub fn is_slru_block_key(key: Key) -> bool { + key.field1 == 0x01 // SLRU-related + && key.field3 == 0x00000001 // but not SlruDir + && key.field6 != 0xffffffff // and not SlruSegSize +} + #[inline(always)] pub fn is_rel_block_key(key: &Key) -> bool { key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`. +#[inline(always)] pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 77ce9981f0..d9dba137e4 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::{rel_block_to_key, Version}; +use crate::pgdatadir_mapping::Version; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; +use pageserver_api::key::rel_block_to_key; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d9cc85319e..ae182b8dc6 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -13,7 +13,12 @@ use crate::repository::*; use crate::walrecord::NeonWalRecord; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; -use pageserver_api::key::is_rel_block_key; +use pageserver_api::key::{ + dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key, + rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, + slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, + AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, +}; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -1535,366 +1540,6 @@ struct SlruSegmentDirectory { static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); -// Layout of the Key address space -// -// The Key struct, used to address the underlying key-value store, consists of -// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map -// all the data and metadata keys into those 18 bytes. -// -// Principles for the mapping: -// -// - Things that are often accessed or modified together, should be close to -// each other in the key space. For example, if a relation is extended by one -// block, we create a new key-value pair for the block data, and update the -// relation size entry. Because of that, the RelSize key comes after all the -// RelBlocks of a relation: the RelSize and the last RelBlock are always next -// to each other. -// -// The key space is divided into four major sections, identified by the first -// byte, and the form a hierarchy: -// -// 00 Relation data and metadata -// -// DbDir () -> (dbnode, spcnode) -// Filenodemap -// RelDir -> relnode forknum -// RelBlocks -// RelSize -// -// 01 SLRUs -// -// SlruDir kind -// SlruSegBlocks segno -// SlruSegSize -// -// 02 pg_twophase -// -// 03 misc -// Controlfile -// checkpoint -// pg_version -// -// 04 aux files -// -// Below is a full list of the keyspace allocation: -// -// DbDir: -// 00 00000000 00000000 00000000 00 00000000 -// -// Filenodemap: -// 00 SPCNODE DBNODE 00000000 00 00000000 -// -// RelDir: -// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0) -// -// RelBlock: -// 00 SPCNODE DBNODE RELNODE FORK BLKNUM -// -// RelSize: -// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF -// -// SlruDir: -// 01 kind 00000000 00000000 00 00000000 -// -// SlruSegBlock: -// 01 kind 00000001 SEGNO 00 BLKNUM -// -// SlruSegSize: -// 01 kind 00000001 SEGNO 00 FFFFFFFF -// -// TwoPhaseDir: -// 02 00000000 00000000 00000000 00 00000000 -// -// TwoPhaseFile: -// 02 00000000 00000000 00000000 00 XID -// -// ControlFile: -// 03 00000000 00000000 00000000 00 00000000 -// -// Checkpoint: -// 03 00000000 00000000 00000000 00 00000001 -// -// AuxFiles: -// 03 00000000 00000000 00000000 00 00000002 -// - -//-- Section 01: relation data and metadata - -const DBDIR_KEY: Key = Key { - field1: 0x00, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - }..Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0xffffffff, - field5: 0xff, - field6: 0xffffffff, - } -} - -fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { - Key { - field1: 0x00, - field2: spcnode, - field3: dbnode, - field4: 0, - field5: 0, - field6: 1, - } -} - -pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: blknum, - } -} - -fn rel_size_to_key(rel: RelTag) -> Key { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0xffffffff, - } -} - -fn rel_key_range(rel: RelTag) -> Range { - Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum, - field6: 0, - }..Key { - field1: 0x00, - field2: rel.spcnode, - field3: rel.dbnode, - field4: rel.relnode, - field5: rel.forknum + 1, - field6: 0, - } -} - -//-- Section 02: SLRUs - -fn slru_dir_to_key(kind: SlruKind) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 0, - field4: 0, - field5: 0, - field6: 0, - } -} - -fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: blknum, - } -} - -fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { - Key { - field1: 0x01, - field2: match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }, - field3: 1, - field4: segno, - field5: 0, - field6: 0xffffffff, - } -} - -fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { - let field2 = match kind { - SlruKind::Clog => 0x00, - SlruKind::MultiXactMembers => 0x01, - SlruKind::MultiXactOffsets => 0x02, - }; - - Key { - field1: 0x01, - field2, - field3: 1, - field4: segno, - field5: 0, - field6: 0, - }..Key { - field1: 0x01, - field2, - field3: 1, - field4: segno, - field5: 1, - field6: 0, - } -} - -//-- Section 03: pg_twophase - -const TWOPHASEDIR_KEY: Key = Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -fn twophase_file_key(xid: TransactionId) -> Key { - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - } -} - -fn twophase_key_range(xid: TransactionId) -> Range { - let (next_xid, overflowed) = xid.overflowing_add(1); - - Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: xid, - }..Key { - field1: 0x02, - field2: 0, - field3: 0, - field4: 0, - field5: u8::from(overflowed), - field6: next_xid, - } -} - -//-- Section 03: Control file -const CONTROLFILE_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 0, -}; - -const CHECKPOINT_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 1, -}; - -const AUX_FILES_KEY: Key = Key { - field1: 0x03, - field2: 0, - field3: 0, - field4: 0, - field5: 0, - field6: 2, -}; - -// Reverse mappings for a few Keys. -// These are needed by WAL redo manager. - -// AUX_FILES currently stores only data for logical replication (slots etc), and -// we don't preserve these on a branch because safekeepers can't follow timeline -// switch (and generally it likely should be optional), so ignore these. -pub fn is_inherited_key(key: Key) -> bool { - key != AUX_FILES_KEY -} - -pub fn is_rel_fsm_block_key(key: Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff -} - -pub fn is_rel_vm_block_key(key: Key) -> bool { - key.field1 == 0x00 - && key.field4 != 0 - && key.field5 == VISIBILITYMAP_FORKNUM - && key.field6 != 0xffffffff -} - -pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> { - Ok(match key.field1 { - 0x01 => { - let kind = match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2), - }; - let segno = key.field4; - let blknum = key.field6; - - (kind, segno, blknum) - } - _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1), - }) -} - -fn is_slru_block_key(key: Key) -> bool { - key.field1 == 0x01 // SLRU-related - && key.field3 == 0x00000001 // but not SlruDir - && key.field6 != 0xffffffff // and not SlruSegSize -} - #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0cb7cf26f2..69d0d0b320 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -73,8 +73,8 @@ use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::CalculateLogicalSizeError; -use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::tenant::config::TenantConfOpt; +use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use pageserver_api::reltag::RelTag; use pageserver_api::shard::ShardIndex; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index f2c35436db..3183608862 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -33,11 +33,12 @@ use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; -use crate::pgdatadir_mapping::*; +use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index b4aadb2a8c..189d77d101 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -47,11 +47,10 @@ use crate::metrics::{ WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME, }; -use crate::pgdatadir_mapping::key_to_slru_block; use crate::repository::Key; use crate::walrecord::NeonWalRecord; -use pageserver_api::key::key_to_rel_block; +use pageserver_api::key::{key_to_rel_block, key_to_slru_block}; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;