Eliminate dependency from pageserver_api to postgres_ffi (#12273)

Introduce a separate `postgres_ffi_types` crate which contains a few types and functions that were used in the API. `postgres_ffi_types` is a much smaller crate than `postgres_ffi`, and it doesn't depend on bindgen or the Postgres C headers. Move NeonWalRecord and Value types to wal_decoder crate. They are only used in the pageserver-safekeeper "ingest" API. The rest of the ingest API types are defined in wal_decoder, so move these there as well.
2026-01-09 06:22:57 +00:00 · 2025-06-19 13:31:27 +03:00
parent 2ca6665f4a
commit 1950ccfe33
44 changed files with 183 additions and 128 deletions
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -4,8 +4,8 @@ use std::ops::Range;
 use anyhow::{Result, bail};
 use byteorder::{BE, ByteOrder};
 use bytes::Bytes;
-use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi::{Oid, RepOriginId};
+use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi_types::{Oid, RepOriginId};
 use serde::{Deserialize, Serialize};
 use utils::const_assert;

@@ -194,7 +194,7 @@ impl Key {
    /// will be rejected on the write path.
    #[allow(dead_code)]
    pub fn is_valid_key_on_write_path_strong(&self) -> bool {
-        use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
+        use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
        if !self.is_i128_representable() {
            return false;
        }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,7 +1,6 @@
 use std::ops::Range;

 use itertools::Itertools;
-use postgres_ffi::BLCKSZ;

 use crate::key::Key;
 use crate::shard::{ShardCount, ShardIdentity};
@@ -269,9 +268,13 @@ impl KeySpace {
    /// Partition a key space into roughly chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
-    pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
-        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / BLCKSZ as u64) as u32;
+    pub fn partition(
+        &self,
+        shard_identity: &ShardIdentity,
+        target_size: u64,
+        block_size: u64,
+    ) -> KeyPartitioning {
+        let target_nblocks = (target_size / block_size) as u32;

        let mut parts = Vec::new();
        let mut current_part = Vec::new();
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -6,11 +6,9 @@ pub mod key;
 pub mod keyspace;
 pub mod models;
 pub mod pagestream_api;
-pub mod record;
 pub mod reltag;
 pub mod shard;
 /// Public API types
 pub mod upcall_api;
-pub mod value;

 pub mod config;
--- a/libs/pageserver_api/src/pagestream_api.rs
+++ b/libs/pageserver_api/src/pagestream_api.rs
@@ -8,9 +8,15 @@ use crate::reltag::RelTag;

 use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

+/// Block size.
+///
+/// XXX: We assume 8k block size in the SLRU fetch API. It's not great to hardcode
+/// that in the protocol, because Postgres supports different block sizes as a compile
+/// time option.
+const BLCKSZ: usize = 8192;
+
 // Wrapped in libpq CopyData
 #[derive(PartialEq, Eq, Debug)]
 pub enum PagestreamFeMessage {
@@ -443,7 +449,7 @@ impl PagestreamBeMessage {

                    Self::GetSlruSegment(resp) => {
                        bytes.put_u8(Tag::GetSlruSegment as u8);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                        bytes.put_u32((resp.segment.len() / BLCKSZ) as u32);
                        bytes.put(&resp.segment[..]);
                    }

@@ -520,7 +526,7 @@ impl PagestreamBeMessage {
                        bytes.put_u64(resp.req.hdr.not_modified_since.0);
                        bytes.put_u8(resp.req.kind);
                        bytes.put_u32(resp.req.segno);
-                        bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
+                        bytes.put_u32((resp.segment.len() / BLCKSZ) as u32);
                        bytes.put(&resp.segment[..]);
                    }

@@ -662,7 +668,7 @@ impl PagestreamBeMessage {
                    let kind = buf.read_u8()?;
                    let segno = buf.read_u32::<BigEndian>()?;
                    let n_blocks = buf.read_u32::<BigEndian>()?;
-                    let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
+                    let mut segment = vec![0; n_blocks as usize * BLCKSZ];
                    buf.read_exact(&mut segment)?;
                    Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
                        req: PagestreamGetSlruSegmentRequest {
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -1,133 +0,0 @@
-//! This module defines the WAL record format used within the pageserver.
-
-use bytes::Bytes;
-use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
-use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
-use serde::{Deserialize, Serialize};
-use utils::bin_ser::DeserializeError;
-
-/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
-/// around a PostgreSQL WAL record, or a custom neon-specific "record".
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub enum NeonWalRecord {
-    /// Native PostgreSQL WAL record
-    Postgres { will_init: bool, rec: Bytes },
-
-    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
-    ClearVisibilityMapFlags {
-        new_heap_blkno: Option<u32>,
-        old_heap_blkno: Option<u32>,
-        flags: u8,
-    },
-    /// Mark transaction IDs as committed on a CLOG page
-    ClogSetCommitted {
-        xids: Vec<TransactionId>,
-        timestamp: TimestampTz,
-    },
-    /// Mark transaction IDs as aborted on a CLOG page
-    ClogSetAborted { xids: Vec<TransactionId> },
-    /// Extend multixact offsets SLRU
-    MultixactOffsetCreate {
-        mid: MultiXactId,
-        moff: MultiXactOffset,
-    },
-    /// Extend multixact members SLRU.
-    MultixactMembersCreate {
-        moff: MultiXactOffset,
-        members: Vec<MultiXactMember>,
-    },
-    /// Update the map of AUX files, either writing or dropping an entry
-    AuxFile {
-        file_path: String,
-        content: Option<Bytes>,
-    },
-    // Truncate visibility map page
-    TruncateVisibilityMap {
-        trunc_byte: usize,
-        trunc_offs: usize,
-    },
-
-    /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
-    #[cfg(feature = "testing")]
-    Test {
-        /// Append a string to the image.
-        append: String,
-        /// Clear the image before appending.
-        clear: bool,
-        /// Treat this record as an init record. `clear` should be set to true if this field is set
-        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
-        /// its references in `timeline.rs`.
-        will_init: bool,
-        /// Only append the record if the current image is the same as the one specified in this field.
-        only_if: Option<String>,
-    },
-}
-
-impl NeonWalRecord {
-    /// Does replaying this WAL record initialize the page from scratch, or does
-    /// it need to be applied over the previous image of the page?
-    pub fn will_init(&self) -> bool {
-        // If you change this function, you'll also need to change ValueBytes::will_init
-        match self {
-            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
-            #[cfg(feature = "testing")]
-            NeonWalRecord::Test { will_init, .. } => *will_init,
-            // None of the special neon record types currently initialize the page
-            _ => false,
-        }
-    }
-
-    #[cfg(feature = "testing")]
-    pub fn wal_append(s: impl AsRef<str>) -> Self {
-        Self::Test {
-            append: s.as_ref().to_string(),
-            clear: false,
-            will_init: false,
-            only_if: None,
-        }
-    }
-
-    #[cfg(feature = "testing")]
-    pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
-        Self::Test {
-            append: s.as_ref().to_string(),
-            clear: false,
-            will_init: false,
-            only_if: Some(only_if.as_ref().to_string()),
-        }
-    }
-
-    #[cfg(feature = "testing")]
-    pub fn wal_clear(s: impl AsRef<str>) -> Self {
-        Self::Test {
-            append: s.as_ref().to_string(),
-            clear: true,
-            will_init: false,
-            only_if: None,
-        }
-    }
-
-    #[cfg(feature = "testing")]
-    pub fn wal_init(s: impl AsRef<str>) -> Self {
-        Self::Test {
-            append: s.as_ref().to_string(),
-            clear: true,
-            will_init: true,
-            only_if: None,
-        }
-    }
-}
-
-/// Build a human-readable string to describe a WAL record
-///
-/// For debugging purposes
-pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
-    match rec {
-        NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
-            "will_init: {}, {}",
-            will_init,
-            describe_postgres_wal_record(rec)?
-        )),
-        _ => Ok(format!("{:?}", rec)),
-    }
-}
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -1,9 +1,9 @@
 use std::cmp::Ordering;
 use std::fmt;

-use postgres_ffi::Oid;
-use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
-use postgres_ffi::relfile_utils::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
+use postgres_ffi_types::Oid;
+use postgres_ffi_types::constants::GLOBALTABLESPACE_OID;
+use postgres_ffi_types::forknum::{MAIN_FORKNUM, forkname_to_number, forknumber_to_name};
 use serde::{Deserialize, Serialize};

 ///
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -35,7 +35,7 @@ use std::hash::{Hash, Hasher};

 #[doc(inline)]
 pub use ::utils::shard::*;
-use postgres_ffi::relfile_utils::INIT_FORKNUM;
+use postgres_ffi_types::forknum::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};

 use crate::key::Key;
--- a/libs/pageserver_api/src/value.rs
+++ b/libs/pageserver_api/src/value.rs
@@ -1,257 +0,0 @@
-//! This module defines the value type used by the storage engine.
-//!
-//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
-//! or a "delta" of how to get from previous version of the value to the new one
-//! ([`Value::WalRecord`]])
-//!
-//! Note that the [`Value`] type is used for the permananent storage format, so any
-//! changes to it must be backwards compatible.
-
-use bytes::Bytes;
-use serde::{Deserialize, Serialize};
-
-use crate::record::NeonWalRecord;
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub enum Value {
-    /// An Image value contains a full copy of the value
-    Image(Bytes),
-    /// A WalRecord value contains a WAL record that needs to be
-    /// replayed get the full value. Replaying the WAL record
-    /// might need a previous version of the value (if will_init()
-    /// returns false), or it may be replayed stand-alone (true).
-    WalRecord(NeonWalRecord),
-}
-
-impl Value {
-    #[inline(always)]
-    pub fn is_image(&self) -> bool {
-        matches!(self, Value::Image(_))
-    }
-
-    #[inline(always)]
-    pub fn will_init(&self) -> bool {
-        match self {
-            Value::Image(_) => true,
-            Value::WalRecord(rec) => rec.will_init(),
-        }
-    }
-
-    #[inline(always)]
-    pub fn estimated_size(&self) -> usize {
-        match self {
-            Value::Image(image) => image.len(),
-            Value::WalRecord(NeonWalRecord::AuxFile {
-                content: Some(content),
-                ..
-            }) => content.len(),
-            Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(),
-            Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4,
-            Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4,
-            Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => {
-                members.len() * 8
-            }
-            _ => 8192, /* use image size as the estimation */
-        }
-    }
-}
-
-#[derive(Debug, PartialEq)]
-pub enum InvalidInput {
-    TooShortValue,
-    TooShortPostgresRecord,
-}
-
-/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
-/// use this type for querying if a slice looks some particular way.
-pub struct ValueBytes;
-
-impl ValueBytes {
-    #[inline(always)]
-    pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
-        if raw.len() < 12 {
-            return Err(InvalidInput::TooShortValue);
-        }
-
-        let value_discriminator = &raw[0..4];
-
-        if value_discriminator == [0, 0, 0, 0] {
-            // Value::Image always initializes
-            return Ok(true);
-        }
-
-        if value_discriminator != [0, 0, 0, 1] {
-            // not a Value::WalRecord(..)
-            return Ok(false);
-        }
-
-        let walrecord_discriminator = &raw[4..8];
-
-        if walrecord_discriminator != [0, 0, 0, 0] {
-            // only NeonWalRecord::Postgres can have will_init
-            return Ok(false);
-        }
-
-        if raw.len() < 17 {
-            return Err(InvalidInput::TooShortPostgresRecord);
-        }
-
-        Ok(raw[8] == 1)
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use bytes::Bytes;
-    use utils::bin_ser::BeSer;
-
-    use super::*;
-
-    macro_rules! roundtrip {
-        ($orig:expr, $expected:expr) => {{
-            let orig: Value = $orig;
-
-            let actual = Value::ser(&orig).unwrap();
-            let expected: &[u8] = &$expected;
-
-            assert_eq!(utils::Hex(&actual), utils::Hex(expected));
-
-            let deser = Value::des(&actual).unwrap();
-
-            assert_eq!(orig, deser);
-        }};
-    }
-
-    #[test]
-    fn image_roundtrip() {
-        let image = Bytes::from_static(b"foobar");
-        let image = Value::Image(image);
-
-        #[rustfmt::skip]
-        let expected = [
-            // top level discriminator of 4 bytes
-            0x00, 0x00, 0x00, 0x00,
-            // 8 byte length
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
-            // foobar
-            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
-        ];
-
-        roundtrip!(image, expected);
-
-        assert!(ValueBytes::will_init(&expected).unwrap());
-    }
-
-    #[test]
-    fn walrecord_postgres_roundtrip() {
-        let rec = NeonWalRecord::Postgres {
-            will_init: true,
-            rec: Bytes::from_static(b"foobar"),
-        };
-        let rec = Value::WalRecord(rec);
-
-        #[rustfmt::skip]
-        let expected = [
-            // flattened discriminator of total 8 bytes
-            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
-            // will_init
-            0x01,
-            // 8 byte length
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06,
-            // foobar
-            0x66, 0x6f, 0x6f, 0x62, 0x61, 0x72
-        ];
-
-        roundtrip!(rec, expected);
-
-        assert!(ValueBytes::will_init(&expected).unwrap());
-    }
-
-    #[test]
-    fn bytes_inspection_too_short_image() {
-        let rec = Value::Image(Bytes::from_static(b""));
-
-        #[rustfmt::skip]
-        let expected = [
-            // top level discriminator of 4 bytes
-            0x00, 0x00, 0x00, 0x00,
-            // 8 byte length
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        roundtrip!(rec, expected);
-
-        assert!(ValueBytes::will_init(&expected).unwrap());
-        assert_eq!(expected.len(), 12);
-        for len in 0..12 {
-            assert_eq!(
-                ValueBytes::will_init(&expected[..len]).unwrap_err(),
-                InvalidInput::TooShortValue
-            );
-        }
-    }
-
-    #[test]
-    fn bytes_inspection_too_short_postgres_record() {
-        let rec = NeonWalRecord::Postgres {
-            will_init: false,
-            rec: Bytes::from_static(b""),
-        };
-        let rec = Value::WalRecord(rec);
-
-        #[rustfmt::skip]
-        let expected = [
-            // flattened discriminator of total 8 bytes
-            0x00, 0x00, 0x00, 0x01,
-            0x00, 0x00, 0x00, 0x00,
-            // will_init
-            0x00,
-            // 8 byte length
-            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        ];
-
-        roundtrip!(rec, expected);
-
-        assert!(!ValueBytes::will_init(&expected).unwrap());
-        assert_eq!(expected.len(), 17);
-        for len in 12..17 {
-            assert_eq!(
-                ValueBytes::will_init(&expected[..len]).unwrap_err(),
-                InvalidInput::TooShortPostgresRecord
-            )
-        }
-        for len in 0..12 {
-            assert_eq!(
-                ValueBytes::will_init(&expected[..len]).unwrap_err(),
-                InvalidInput::TooShortValue
-            )
-        }
-    }
-
-    #[test]
-    fn clear_visibility_map_flags_example() {
-        let rec = NeonWalRecord::ClearVisibilityMapFlags {
-            new_heap_blkno: Some(0x11),
-            old_heap_blkno: None,
-            flags: 0x03,
-        };
-        let rec = Value::WalRecord(rec);
-
-        #[rustfmt::skip]
-        let expected = [
-            // discriminators
-            0x00, 0x00, 0x00, 0x01,
-            0x00, 0x00, 0x00, 0x01,
-            // Some == 1 followed by 4 bytes
-            0x01, 0x00, 0x00, 0x00, 0x11,
-            // None == 0
-            0x00,
-            // flags
-            0x03
-        ];
-
-        roundtrip!(rec, expected);
-
-        assert!(!ValueBytes::will_init(&expected).unwrap());
-    }
-}