diff --git a/Cargo.lock b/Cargo.lock index cff07239e7..85a59ec0ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3658,6 +3658,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "twox-hash", "url", "utils", "walkdir", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 1d66dd8878..01919e8325 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -1,8 +1,10 @@ use anyhow::{bail, Result}; use byteorder::{ByteOrder, BE}; +use bytes::BufMut; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::{Oid, TransactionId}; use serde::{Deserialize, Serialize}; +use std::ops::RangeInclusive; use std::{fmt, ops::Range}; use crate::reltag::{BlockNumber, RelTag, SlruKind}; @@ -21,9 +23,81 @@ pub struct Key { pub field6: u32, } +/// The storage key size. pub const KEY_SIZE: usize = 18; +/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized. +/// See [`Key::to_i128`] for more information on the encoding. +pub const METADATA_KEY_SIZE: usize = 16; +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 are metadata keys. +pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80; +/// The (reserved) key prefix of relation sizes. +pub const RELATION_SIZE_PREFIX: u8 = 0x81; +/// The key prefix of AUX file keys. +pub const AUX_KEY_PREFIX: u8 = 0x82; +/// Check if the key falls in the range of metadata keys. +pub const fn is_metadata_key_slice(key: &[u8]) -> bool { + key[0] >= METADATA_KEY_BEGIN_PREFIX +} + impl Key { + /// Check if the key falls in the range of metadata keys. + pub const fn is_metadata_key(&self) -> bool { + self.field1 >= METADATA_KEY_BEGIN_PREFIX + } + + /// Encode a metadata key to a storage key.
+ pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self { + assert!(is_metadata_key_slice(key), "key not in metadata key range"); + Key { + field1: key[0], + field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32, + field3: u32::from_be_bytes(key[3..7].try_into().unwrap()), + field4: u32::from_be_bytes(key[7..11].try_into().unwrap()), + field5: key[11], + field6: u32::from_be_bytes(key[12..16].try_into().unwrap()), + } + } + + /// Encode a metadata key to a storage key. + pub fn from_metadata_key(key: &[u8]) -> Self { + Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key")) + } + + /// Extract a metadata key to a writer. The result should always be 16 bytes. + pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) { + writer.put_u8(self.field1); + assert!(self.field2 <= 0xFFFF); + writer.put_u16(self.field2 as u16); + writer.put_u32(self.field3); + writer.put_u32(self.field4); + writer.put_u8(self.field5); + writer.put_u32(self.field6); + } + + /// Get the range of metadata keys. + pub fn metadata_key_range() -> RangeInclusive<Key> { + Key { + field1: METADATA_KEY_BEGIN_PREFIX, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + }..=Key { + field1: u8::MAX, + field2: u16::MAX as u32, + field3: u32::MAX, + field4: u32::MAX, + field5: u8::MAX, + field6: u32::MAX, + } + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 @@ -81,6 +155,8 @@ impl Key { key } + /// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self { Key { field1: b[0], @@ -92,6 +168,8 @@ impl Key { } } + /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently. + /// Use [`Key::extract_metadata_key_to_writer`] instead. pub fn write_to_byte_slice(&self, buf: &mut [u8]) { buf[0] = self.field1; BE::write_u32(&mut buf[1..5], self.field2); @@ -558,11 +636,14 @@ impl std::str::FromStr for Key { mod tests { use std::str::FromStr; + use crate::key::is_metadata_key_slice; use crate::key::Key; use rand::Rng; use rand::SeedableRng; + use super::AUX_KEY_PREFIX; + #[test] fn display_fromstr_bijection() { let mut rng = rand::rngs::StdRng::seed_from_u64(42); @@ -578,4 +659,16 @@ mod tests { assert_eq!(key, Key::from_str(&format!("{key}")).unwrap()); } + + #[test] + fn test_metadata_keys() { + let mut metadata_key = vec![AUX_KEY_PREFIX]; + metadata_key.extend_from_slice(&[0xFF; 15]); + let encoded_key = Key::from_metadata_key(&metadata_key); + let mut output_key = Vec::new(); + encoded_key.extract_metadata_key_to_writer(&mut output_key); + assert_eq!(metadata_key, output_key); + assert!(encoded_key.is_metadata_key()); + assert!(is_metadata_key_slice(&metadata_key)); + } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 7a11610a91..4335f38f1e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -70,6 +70,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +twox-hash.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/src/aux_file.rs b/pageserver/src/aux_file.rs new file mode 100644 index 0000000000..aba4ccf19d --- /dev/null +++ b/pageserver/src/aux_file.rs @@ -0,0 +1,112 @@ +use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE}; +use tracing::warn; + +/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory 
prefix, first 13B of 128b xxhash]. +fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key { + let mut key = [0; METADATA_KEY_SIZE]; + let hash = twox_hash::xxh3::hash128(data).to_be_bytes(); + key[0] = AUX_KEY_PREFIX; + key[1] = dir_level1; + key[2] = dir_level2; + key[3..16].copy_from_slice(&hash[0..13]); + Key::from_metadata_key_fixed_size(&key) +} + +const AUX_DIR_PG_LOGICAL: u8 = 0x01; +const AUX_DIR_PG_REPLSLOT: u8 = 0x02; +const AUX_DIR_PG_UNKNOWN: u8 = 0xFF; + +/// Encode the aux file into a fixed-size key. +/// +/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type. +/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path +/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix +/// is roughly based on the first two components of the path, one unique number for one component. +/// +/// * pg_logical/mappings -> 0x0101 +/// * pg_logical/snapshots -> 0x0102 +/// * pg_logical/replorigin_checkpoint -> 0x0103 +/// * pg_logical/others -> 0x01FF +/// * pg_replslot/ -> 0x0201 +/// * others -> 0xFFFF +/// +/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`. +/// The new file type must have never been written to the storage before. Otherwise, there could be data +/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix. 
+pub fn encode_aux_file_key(path: &str) -> Key { + if let Some(fname) = path.strip_prefix("pg_logical/mappings/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes()) + } else if path == "pg_logical/replorigin_checkpoint" { + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"") + } else if let Some(fname) = path.strip_prefix("pg_logical/") { + if cfg!(debug_assertions) { + warn!( + "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes()) + } else if let Some(fname) = path.strip_prefix("pg_replslot/") { + aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes()) + } else { + if cfg!(debug_assertions) { + warn!( + "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning", + path + ); + } + aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_hash_portable() { + // AUX file encoding requires the hash to be portable across all platforms. This test case checks + // if the algorithm produces the same hash across different environments. + assert_eq!( + 305317690835051308206966631765527126151, + twox_hash::xxh3::hash128("test1".as_bytes()) + ); + assert_eq!( + 85104974691013376326742244813280798847, + twox_hash::xxh3::hash128("test/test2".as_bytes()) + ); + assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes())); + } + + #[test] + fn test_encoding_portable() { + // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions + // of the page server.
+ assert_eq!( + "8200000101E5B20C5F8DD5AA3289D6D9EAFA", + encode_aux_file_key("pg_logical/mappings/test1").to_string() + ); + assert_eq!( + "820000010239AAC544893139B26F501B97E6", + encode_aux_file_key("pg_logical/snapshots/test2").to_string() + ); + assert_eq!( + "820000010300000000000000000000000000", + encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string() + ); + assert_eq!( + "82000001FF8635AF2134B7266EC5B4189FD6", + encode_aux_file_key("pg_logical/unsupported").to_string() + ); + assert_eq!( + "8200000201772D0E5D71DE14DA86142A1619", + encode_aux_file_key("pg_replslot/test3").to_string() + ); + assert_eq!( + "820000FFFF1866EBEB53B807B26A2416F317", + encode_aux_file_key("other_file_not_supported").to_string() + ); + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f947a75f61..930700e50c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub use pageserver_api::keyspace; +pub mod aux_file; pub mod metrics; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 4a9682dcac..c733b38acb 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1402,7 +1402,7 @@ impl<'a> DatadirModification<'a> { let n_files; let mut aux_files = self.tline.aux_files.lock().await; if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value + // We already updated aux files in `self`: emit a delta and update our latest value. dir.upsert(file_path.clone(), content.clone()); n_files = dir.files.len(); if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {