feat(pageserver): add metadata key range and aux key encoding (#7401)

Extracted from https://github.com/neondatabase/neon/pull/7375. We
assume every key whose first byte is >= 0x80 is a metadata key. AUX
file keys are part of the metadata keys, and we use `0x82` (see
`AUX_KEY_PREFIX` below) as the prefix for AUX file keys.
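For concreteness, a minimal sketch of the implied classification (the
constants mirror the ones added to `key.rs` in this diff; the helper
function itself is hypothetical):

```rust
/// Classify a key by its first byte (hypothetical helper; the constants
/// mirror the ones added to key.rs in this diff).
const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
const AUX_KEY_PREFIX: u8 = 0x82;

fn classify(first_byte: u8) -> &'static str {
    if first_byte == AUX_KEY_PREFIX {
        "aux file key (a subset of the metadata keys)"
    } else if first_byte >= METADATA_KEY_BEGIN_PREFIX {
        "other metadata key"
    } else {
        "regular storage key"
    }
}
```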

The AUX file encoding is described in the code comment. We use xxhash128
as the hash algorithm. It appears to be portable, according to its
introduction:

> xxHash is an Extremely fast Hash algorithm, processing at RAM speed
> limits. Code is highly portable, and produces hashes identical across
> all platforms (little / big endian).

...though whether the Rust port follows the same convention is unknown
and may need a manual review of the library. In any case, we can always
change the hash algorithm before rolling this out to staging/end users.
I made a quick decision to use xxhash here because it produces a
128-bit hash and is portable; we can defer the discussion of which hash
algorithm to use.
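For reference, a minimal sketch of how the 128-bit hash is folded into
a 16-byte key, mirroring `aux_hash_to_metadata_key` in this diff (the
input path and the directory bytes are illustrative):

```rust
fn main() {
    // xxh3::hash128 matches the call used in aux_file.rs below.
    let hash = twox_hash::xxh3::hash128(b"some_file_name").to_be_bytes(); // [u8; 16], big-endian
    let mut key = [0u8; 16];
    key[0] = 0x82; // AUX_KEY_PREFIX
    key[1] = 0x01; // dir_level1, e.g. pg_logical
    key[2] = 0x02; // dir_level2, e.g. snapshots
    key[3..16].copy_from_slice(&hash[0..13]); // only the first 13 hash bytes fit in the key
    println!("{key:02X?}");
}
```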

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Author: Alex Chi Z <chi@neon.tech>
Date: 2024-04-23 11:16:04 -04:00
Committed by: GitHub
Commit: 89f023e6b0 (parent 8426fb886b)
6 changed files with 209 additions and 1 deletion

Cargo.lock (generated)

@@ -3658,6 +3658,7 @@ dependencies = [
"tokio-util",
"toml_edit",
"tracing",
"twox-hash",
"url",
"utils",
"walkdir",

libs/pageserver_api/src/key.rs

@@ -1,8 +1,10 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::ops::RangeInclusive;
use std::{fmt, ops::Range};
use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -21,9 +23,81 @@ pub struct Key {
pub field6: u32,
}
/// The storage key size.
pub const KEY_SIZE: usize = 18;
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 are metadata keys.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x81;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x82;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX
}
impl Key {
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key(&self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key(key: &[u8]) -> Self {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub fn metadata_key_range() -> RangeInclusive<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..=Key {
field1: u8::MAX,
field2: u16::MAX as u32,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX,
}
}
/// 'field2' is used to store the tablespace OID for relations and small enum numbers for other relish kinds.
/// As long as Neon does not support tablespaces (because of the lack of access to the local file system),
/// we can assume that only some predefined namespace OIDs are used, which can fit in u16
@@ -81,6 +155,8 @@ impl Key {
key
}
/// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -92,6 +168,8 @@ impl Key {
}
}
/// Convert a key to an 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -558,11 +636,14 @@ impl std::str::FromStr for Key {
mod tests {
use std::str::FromStr;
use crate::key::is_metadata_key_slice;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
use super::AUX_KEY_PREFIX;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -578,4 +659,16 @@ mod tests {
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
#[test]
fn test_metadata_keys() {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
}
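As a usage sketch (not part of this diff), the new pair of functions
round-trips a 16-byte metadata key losslessly; the input bytes here are
made up:

```rust
use pageserver_api::key::Key;

fn main() {
    let metadata_key: [u8; 16] = [
        0x82, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
        0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
    ];
    let key = Key::from_metadata_key_fixed_size(&metadata_key);
    let mut out = Vec::new();
    key.extract_metadata_key_to_writer(&mut out);
    assert_eq!(&metadata_key[..], &out[..]); // lossless 16-byte round trip
}
```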

pageserver/Cargo.toml

@@ -70,6 +70,7 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

pageserver/src/aux_file.rs (new file)

@@ -0,0 +1,112 @@
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key = [0; METADATA_KEY_SIZE];
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[0..13]);
Key::from_metadata_key_fixed_size(&key)
}
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
/// Encode the aux file into a fixed-size key.
///
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
/// We have a one-to-one mapping for each of the aux file types that we support. We hash the remaining part of
/// the path (usually a single file name, or several path components) into a 13-byte hash. The 2-byte prefix is
/// determined roughly from the first two components of the path, one unique number per component.
///
/// * pg_logical/mappings -> 0x0101
/// * pg_logical/snapshots -> 0x0102
/// * pg_logical/replorigin_checkpoint -> 0x0103
/// * pg_logical/others -> 0x01FF
/// * pg_replslot/ -> 0x0201
/// * others -> 0xFFFF
///
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
/// The new file type must never have been written to storage before. Otherwise, there could be data
/// corruption: the new file belongs to a new prefix, but it might already have been stored under the `others` prefix.
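/// For example (the file name is illustrative, not from this diff): `pg_logical/snapshots/0-1.snap`
/// hashes only the suffix `0-1.snap`; the resulting key is `0x82 0x01 0x02` followed by the first
/// 13 bytes of `xxh3::hash128(b"0-1.snap")` in big-endian order.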
pub fn encode_aux_file_key(path: &str) -> Key {
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
} else if path == "pg_logical/replorigin_checkpoint" {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
if cfg!(debug_assertions) {
warn!(
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
} else {
if cfg!(debug_assertions) {
warn!(
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
);
assert_eq!(
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
fn test_encoding_portable() {
// To correctly retrieve AUX files, the key generated for a given file must be the same across all
// versions of the pageserver.
assert_eq!(
"8200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"820000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"820000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"82000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"8200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"820000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}
}
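A usage sketch of the public entry point (the paths are made up;
determinism across invocations is the property the tests above pin
down):

```rust
use pageserver::aux_file::encode_aux_file_key;

fn main() {
    let k1 = encode_aux_file_key("pg_logical/snapshots/0-1.snap");
    let k2 = encode_aux_file_key("pg_logical/snapshots/0-1.snap");
    assert_eq!(k1, k2);            // the same path always yields the same key
    assert!(k1.is_metadata_key()); // aux keys live in the metadata range (first byte >= 0x80)
}
```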

pageserver/src/lib.rs

@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
pub mod page_service;

pageserver/src/pgdatadir_mapping.rs

@@ -1402,7 +1402,7 @@ impl<'a> DatadirModification<'a> {
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {