pageserver: move things around to prepare for decoding logic

We wish to have high level WAL decoding logic in `wal_decoder::decoder`
module. For this we need the `Value` and `NeonWalRecord` types
accessible there, so:
1. Move `Value` and `NeonWalRecord` to `pageserver_api::value` and
   `pageserver_api::record` respectively. I had to add a testing feature
   to `pageserver_api` to get this working due to `NeonWalRecord` test
   directives.
2. Get rid of `pageserver::repository` (follow up from (1))
3. Move PG specific WAL record types to `postgres_ffi::record`. In
   theory they could live in `wal_decoder`, but it would create a
   circular dependency between `wal_decoder` and `postgres_ffi`.
   Long term it makes sennse for those types to be PG version specific,
   so that will work out nicely.
4. Move higher level WAL record types (to be ingested by pageserver)
   into `wal_decoder::models`
This commit is contained in:
Vlad Lazar
2024-10-22 11:00:10 +02:00
parent 56101531c0
commit b87ac5b375
45 changed files with 985 additions and 917 deletions

View File

@@ -5,9 +5,11 @@ pub mod controller_api;
pub mod key;
pub mod keyspace;
pub mod models;
pub mod record;
pub mod reltag;
pub mod shard;
/// Public API types
pub mod upcall_api;
pub mod value;
pub mod config;

View File

@@ -0,0 +1,113 @@
//! This module defines the WAL record format used within the pageserver.
use bytes::Bytes;
use postgres_ffi::record::{describe_postgres_wal_record, MultiXactMember};
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted {
xids: Vec<TransactionId>,
timestamp: TimestampTz,
},
/// Mark transaction IDs as aborted on a CLOG page
ClogSetAborted { xids: Vec<TransactionId> },
/// Extend multixact offsets SLRU
MultixactOffsetCreate {
mid: MultiXactId,
moff: MultiXactOffset,
},
/// Extend multixact members SLRU.
MultixactMembersCreate {
moff: MultiXactOffset,
members: Vec<MultiXactMember>,
},
/// Update the map of AUX files, either writing or dropping an entry
AuxFile {
file_path: String,
content: Option<Bytes>,
},
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
#[cfg(feature = "testing")]
Test {
/// Append a string to the image.
append: String,
/// Clear the image before appending.
clear: bool,
/// Treat this record as an init record. `clear` should be set to true if this field is set
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
/// its references in `timeline.rs`.
will_init: bool,
},
}
impl NeonWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
// If you change this function, you'll also need to change ValueBytes::will_init
match self {
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
#[cfg(feature = "testing")]
NeonWalRecord::Test { will_init, .. } => *will_init,
// None of the special neon record types currently initialize the page
_ => false,
}
}
#[cfg(feature = "testing")]
pub fn wal_append(s: impl AsRef<str>) -> Self {
Self::Test {
append: s.as_ref().to_string(),
clear: false,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_clear() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: false,
}
}
#[cfg(feature = "testing")]
pub fn wal_init() -> Self {
Self::Test {
append: "".to_string(),
clear: true,
will_init: true,
}
}
}
/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
match rec {
NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
"will_init: {}, {}",
will_init,
describe_postgres_wal_record(rec)?
)),
_ => Ok(format!("{:?}", rec)),
}
}

View File

@@ -0,0 +1,228 @@
//! This module defines the value type used by the storage engine.
use crate::record::NeonWalRecord;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum Value {
/// An Image value contains a full copy of the value
Image(Bytes),
/// A WalRecord value contains a WAL record that needs to be
/// replayed get the full value. Replaying the WAL record
/// might need a previous version of the value (if will_init()
/// returns false), or it may be replayed stand-alone (true).
WalRecord(NeonWalRecord),
}
impl Value {
pub fn is_image(&self) -> bool {
matches!(self, Value::Image(_))
}
pub fn will_init(&self) -> bool {
match self {
Value::Image(_) => true,
Value::WalRecord(rec) => rec.will_init(),
}
}
}
#[derive(Debug, PartialEq)]
pub enum InvalidInput {
TooShortValue,
TooShortPostgresRecord,
}
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
/// use this type for querying if a slice looks some particular way.
pub struct ValueBytes;
impl ValueBytes {
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
if raw.len() < 12 {
return Err(InvalidInput::TooShortValue);
}
let value_discriminator = &raw[0..4];
if value_discriminator == [0, 0, 0, 0] {
// Value::Image always initializes
return Ok(true);
}
if value_discriminator != [0, 0, 0, 1] {
// not a Value::WalRecord(..)
return Ok(false);
}
let walrecord_discriminator = &raw[4..8];
if walrecord_discriminator != [0, 0, 0, 0] {
// only NeonWalRecord::Postgres can have will_init
return Ok(false);
}
if raw.len() < 17 {
return Err(InvalidInput::TooShortPostgresRecord);
}
Ok(raw[8] == 1)
}
}
#[cfg(test)]
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {
($orig:expr, $expected:expr) => {{
let orig: Value = $orig;
let actual = Value::ser(&orig).unwrap();
let expected: &[u8] = &$expected;
assert_eq!(utils::Hex(&actual), utils::Hex(expected));
let deser = Value::des(&actual).unwrap();
assert_eq!(orig, deser);
}};
}
#[test]
fn image_roundtrip() {
let image = Bytes::from_static(b"foobar");
let image = Value::Image(image);
#[rustfmt::skip]
let expected = [
// top level discriminator of 4 bytes
0x00, 0x00, 0x00, 0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
// foobar
0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
];
roundtrip!(image, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
}
#[test]
fn walrecord_postgres_roundtrip() {
let rec = NeonWalRecord::Postgres {
will_init: true,
rec: Bytes::from_static(b"foobar"),
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// flattened discriminator of total 8 bytes
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
// will_init
0x01,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
// foobar
0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
];
roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
}
#[test]
fn bytes_inspection_too_short_image() {
let rec = Value::Image(Bytes::from_static(b""));
#[rustfmt::skip]
let expected = [
// top level discriminator of 4 bytes
0x00, 0x00, 0x00, 0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 12);
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
);
}
}
#[test]
fn bytes_inspection_too_short_postgres_record() {
let rec = NeonWalRecord::Postgres {
will_init: false,
rec: Bytes::from_static(b""),
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// flattened discriminator of total 8 bytes
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x00,
// will_init
0x00,
// 8 byte length
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
assert_eq!(expected.len(), 17);
for len in 12..17 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortPostgresRecord
)
}
for len in 0..12 {
assert_eq!(
ValueBytes::will_init(&expected[..len]).unwrap_err(),
InvalidInput::TooShortValue
)
}
}
#[test]
fn clear_visibility_map_flags_example() {
let rec = NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: Some(0x11),
old_heap_blkno: None,
flags: 0x03,
};
let rec = Value::WalRecord(rec);
#[rustfmt::skip]
let expected = [
// discriminators
0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x01,
// Some == 1 followed by 4 bytes
0x01, 0x00, 0x00, 0x00, 0x11,
// None == 0
0x00,
// flags
0x03
];
roundtrip!(rec, expected);
assert!(!ValueBytes::will_init(&expected).unwrap());
}
}