Merge remote-tracking branch 'origin/main' into vlad/read-path-concurrent-io

This commit is contained in:
Christian Schwarz
2025-01-20 19:46:44 +01:00
100 changed files with 5316 additions and 1317 deletions

View File

@@ -324,7 +324,7 @@ impl From<NodeSchedulingPolicy> for String {
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum SkSchedulingPolicy {
Active,
Disabled,
Pause,
Decomissioned,
}
@@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy {
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"active" => Self::Active,
"disabled" => Self::Disabled,
"pause" => Self::Pause,
"decomissioned" => Self::Decomissioned,
_ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
_ => {
return Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,pause,decomissioned"
))
}
})
}
}
@@ -346,7 +350,7 @@ impl From<SkSchedulingPolicy> for String {
use SkSchedulingPolicy::*;
match value {
Active => "active",
Disabled => "disabled",
Pause => "pause",
Decomissioned => "decomissioned",
}
.to_string()
@@ -416,8 +420,6 @@ pub struct MetadataHealthListOutdatedResponse {
}
/// Publicly exposed safekeeper description
///
/// The `active` flag which we have in the DB is not included on purpose: it is deprecated.
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperDescribeResponse {
pub id: NodeId,
@@ -433,6 +435,11 @@ pub struct SafekeeperDescribeResponse {
pub scheduling_policy: SkSchedulingPolicy,
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SafekeeperSchedulingPolicyRequest {
pub scheduling_policy: SkSchedulingPolicy,
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -24,7 +24,9 @@ pub struct Key {
/// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as
/// a struct of fields.
#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)]
#[derive(
Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug,
)]
pub struct CompactKey(i128);
/// The storage key size.

View File

@@ -29,7 +29,7 @@ use utils::{
};
use crate::{
key::Key,
key::{CompactKey, Key},
reltag::RelTag,
shard::{ShardCount, ShardStripeSize, TenantShardId},
};
@@ -2115,6 +2115,23 @@ impl PagestreamBeMessage {
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct PageTraceEvent {
pub key: CompactKey,
pub effective_lsn: Lsn,
pub time: SystemTime,
}
impl Default for PageTraceEvent {
fn default() -> Self {
Self {
key: Default::default(),
effective_lsn: Default::default(),
time: std::time::UNIX_EPOCH,
}
}
}
#[cfg(test)]
mod tests {
use serde_json::json;

View File

@@ -31,6 +31,8 @@
//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
//! and their slugs are 0004, 0104, 0204, and 0304.
use std::hash::{Hash, Hasher};
use crate::{key::Key, models::ShardParameters};
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
@@ -48,6 +50,23 @@ pub struct ShardIdentity {
layout: ShardLayout,
}
/// Hash implementation
///
/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons.
impl Hash for ShardIdentity {
fn hash<H: Hasher>(&self, state: &mut H) {
let ShardIdentity {
number,
count,
stripe_size: _,
layout: _,
} = self;
number.0.hash(state);
count.0.hash(state);
}
}
/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
}
/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
pub struct ShardLayout(u8);
const LAYOUT_V1: ShardLayout = ShardLayout(1);

View File

@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
use utils::lsn::Lsn;
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlMultiXactCreate {
pub mid: MultiXactId,
/* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlMultiXactTruncate {
pub oldest_multi_db: Oid,
/* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlRelmapUpdate {
pub dbid: Oid, /* database ID, or 0 for shared map */
pub tsid: Oid, /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlReploriginDrop {
pub node_id: RepOriginId,
}
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlReploriginSet {
pub remote_lsn: Lsn,
pub node_id: RepOriginId,
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
}
#[repr(C)]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlSmgrTruncate {
pub blkno: BlockNumber,
pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
///
#[derive(Debug, Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct XlXactParsedRecord {
pub xid: TransactionId,
pub info: u8,

View File

@@ -5,9 +5,12 @@ edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
const_format.workspace = true
serde.workspace = true
serde_json.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true
tokio.workspace = true
utils.workspace = true
pageserver_api.workspace = true

View File

@@ -4,12 +4,15 @@ use const_format::formatcp;
use pq_proto::SystemId;
use serde::{Deserialize, Serialize};
pub mod membership;
/// Public API types
pub mod models;
/// Consensus logical timestamp. Note: it is a part of sk control file.
pub type Term = u64;
pub const INVALID_TERM: Term = 0;
/// With this term timeline is created initially. It
/// is a normal term except wp is never elected with it.
pub const INITIAL_TERM: Term = 0;
/// Information about Postgres. Safekeeper gets it once and then verifies all
/// further connections from computes match. Note: it is a part of sk control

View File

@@ -0,0 +1,166 @@
//! Types defining safekeeper membership, see
//! rfcs/035-safekeeper-dynamic-membership-change.md
//! for details.
use std::{collections::HashSet, fmt::Display};
use anyhow;
use anyhow::bail;
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
/// Number uniquely identifying safekeeper configuration.
/// Note: it is a part of sk control file.
pub type Generation = u32;
/// 1 is the first valid generation, 0 is used as
/// a placeholder before we fully migrate to generations.
pub const INVALID_GENERATION: Generation = 0;
pub const INITIAL_GENERATION: Generation = 1;
/// Membership is defined by ids so e.g. walproposer uses them to figure out
/// quorums, but we also carry host and port to give wp idea where to connect.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafekeeperId {
pub id: NodeId,
pub host: String,
/// We include here only port for computes -- that is, pg protocol tenant
/// only port, or wide pg protocol port if the former is not configured.
pub pg_port: u16,
}
impl Display for SafekeeperId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port)
}
}
/// Set of safekeepers.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(transparent)]
pub struct MemberSet {
pub members: Vec<SafekeeperId>,
}
impl MemberSet {
pub fn empty() -> Self {
MemberSet {
members: Vec::new(),
}
}
pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
if hs.len() != members.len() {
bail!("duplicate safekeeper id in the set {:?}", members);
}
Ok(MemberSet { members })
}
pub fn contains(&self, sk: &SafekeeperId) -> bool {
self.members.iter().any(|m| m.id == sk.id)
}
pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
if self.contains(&sk) {
bail!(format!(
"sk {} is already member of the set {}",
sk.id, self
));
}
self.members.push(sk);
Ok(())
}
}
impl Display for MemberSet {
/// Display as a comma separated list of members.
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let sks_str = self
.members
.iter()
.map(|m| m.to_string())
.collect::<Vec<_>>();
write!(f, "({})", sks_str.join(", "))
}
}
/// Safekeeper membership configuration.
/// Note: it is a part of both control file and http API.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Configuration {
/// Unique id.
pub generation: Generation,
/// Current members of the configuration.
pub members: MemberSet,
/// Some means it is a joint conf.
pub new_members: Option<MemberSet>,
}
impl Configuration {
/// Used for pre-generations timelines, will be removed eventually.
pub fn empty() -> Self {
Configuration {
generation: INVALID_GENERATION,
members: MemberSet::empty(),
new_members: None,
}
}
}
impl Display for Configuration {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"gen={}, members={}, new_members={}",
self.generation,
self.members,
self.new_members
.as_ref()
.map(ToString::to_string)
.unwrap_or(String::from("none"))
)
}
}
#[cfg(test)]
mod tests {
use super::{MemberSet, SafekeeperId};
use utils::id::NodeId;
#[test]
fn test_member_set() {
let mut members = MemberSet::empty();
members
.add(SafekeeperId {
id: NodeId(42),
host: String::from("lala.org"),
pg_port: 5432,
})
.unwrap();
members
.add(SafekeeperId {
id: NodeId(42),
host: String::from("lala.org"),
pg_port: 5432,
})
.expect_err("duplicate must not be allowed");
members
.add(SafekeeperId {
id: NodeId(43),
host: String::from("bubu.org"),
pg_port: 5432,
})
.unwrap();
println!("members: {}", members);
let j = serde_json::to_string(&members).expect("failed to serialize");
println!("members json: {}", j);
assert_eq!(
j,
r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"#
);
}
}

View File

@@ -1,5 +1,6 @@
//! Types used in safekeeper http API. Many of them are also reused internally.
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::TimestampTz;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
@@ -11,7 +12,7 @@ use utils::{
pageserver_feedback::PageserverFeedback,
};
use crate::{ServerInfo, Term};
use crate::{membership::Configuration, ServerInfo, Term};
#[derive(Debug, Serialize)]
pub struct SafekeeperStatus {
@@ -22,13 +23,16 @@ pub struct SafekeeperStatus {
pub struct TimelineCreateRequest {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>,
pub mconf: Configuration,
pub pg_version: u32,
pub system_id: Option<u64>,
// By default WAL_SEGMENT_SIZE
pub wal_seg_size: Option<u32>,
pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>,
pub start_lsn: Lsn,
// Normal creation should omit this field (start_lsn initializes all LSNs).
// However, we allow specifying custom value higher than start_lsn for
// manual recovery case, see test_s3_wal_replay.
pub commit_lsn: Option<Lsn>,
}
/// Same as TermLsn, but serializes LSN using display serializer
@@ -143,7 +147,13 @@ pub type ConnectionId = u32;
/// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalSenderState {
pub enum WalSenderState {
Vanilla(VanillaWalSenderState),
Interpreted(InterpretedWalSenderState),
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VanillaWalSenderState {
pub ttid: TenantTimelineId,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
@@ -152,6 +162,17 @@ pub struct WalSenderState {
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InterpretedWalSenderState {
pub ttid: TenantTimelineId,
pub shard: ShardIdentity,
pub addr: SocketAddr,
pub conn_id: ConnectionId,
// postgres application_name
pub appname: Option<String>,
pub feedback: ReplicationFeedback,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
@@ -172,6 +193,7 @@ pub enum WalReceiverStatus {
pub struct TimelineStatus {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mconf: Configuration,
pub acceptor_state: AcceptorStateStatus,
pub pg_info: ServerInfo,
pub flush_lsn: Lsn,
@@ -186,6 +208,20 @@ pub struct TimelineStatus {
pub walreceivers: Vec<WalReceiverState>,
}
/// Request to switch membership configuration.
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TimelineMembershipSwitchRequest {
pub mconf: Configuration,
}
/// In response both previous and current configuration are sent.
#[derive(Serialize, Deserialize)]
pub struct TimelineMembershipSwitchResponse {
pub previous_conf: Configuration,
pub current_conf: Configuration,
}
fn lsn_invalid() -> Lsn {
Lsn::INVALID
}
@@ -241,3 +277,8 @@ pub struct TimelineTermBumpResponse {
pub previous_term: u64,
pub current_term: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SafekeeperUtilization {
pub timeline_count: u64,
}

View File

@@ -0,0 +1,54 @@
//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
//! don't block reads.
use arc_swap::ArcSwap;
use std::sync::Arc;
use tokio::sync::TryLockError;
pub struct GuardArcSwap<T> {
inner: ArcSwap<T>,
guard: tokio::sync::Mutex<()>,
}
pub struct Guard<'a, T> {
_guard: tokio::sync::MutexGuard<'a, ()>,
inner: &'a ArcSwap<T>,
}
impl<T> GuardArcSwap<T> {
pub fn new(inner: T) -> Self {
Self {
inner: ArcSwap::new(Arc::new(inner)),
guard: tokio::sync::Mutex::new(()),
}
}
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub async fn write_guard(&self) -> Guard<'_, T> {
Guard {
_guard: self.guard.lock().await,
inner: &self.inner,
}
}
pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
let guard = self.guard.try_lock()?;
Ok(Guard {
_guard: guard,
inner: &self.inner,
})
}
}
impl<T> Guard<'_, T> {
pub fn read(&self) -> Arc<T> {
self.inner.load_full()
}
pub fn write(&mut self, value: T) {
self.inner.store(Arc::new(value));
}
}

View File

@@ -98,6 +98,8 @@ pub mod try_rcu;
pub mod pprof;
pub mod guard_arc_swap;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;

View File

@@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[build-dependencies]
tonic-build.workspace = true
[dev-dependencies]
criterion.workspace = true
camino.workspace = true
camino-tempfile.workspace = true
remote_storage.workspace = true
tokio-util.workspace = true
serde_json.workspace = true
futures.workspace = true
tikv-jemallocator.workspace = true
pprof.workspace = true
[[bench]]
name = "bench_interpret_wal"
harness = false

View File

@@ -0,0 +1,34 @@
## WAL Decoding and Interpretation Benchmarks
Note that these benchmarks pull WAL from a public bucket in S3
as a preparation step. Hence, you need a way to auth with AWS.
You can achieve this by copying the `~/.aws/config` file from
the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking
the benchmarks.
To run benchmarks:
```sh
aws sso login --profile dev
# All benchmarks.
AWS_PROFILE=dev cargo bench --package wal_decoder
# Specific file.
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal
# Specific benchmark.
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded
# List available benchmarks.
cargo bench --package wal_decoder --benches -- --list
# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
# Output in target/criterion/*/profile/flamegraph.svg.
AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10
```
Additional charts and statistics are available in `target/criterion/report/index.html`.
Benchmarks are automatically compared against the previous run. To compare against other runs, see
`--baseline` and `--save-baseline`.

View File

@@ -0,0 +1,250 @@
use anyhow::Context;
use criterion::{criterion_group, criterion_main, Criterion};
use futures::{stream::FuturesUnordered, StreamExt};
use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
use pprof::criterion::{Output, PProfProfiler};
use serde::Deserialize;
use std::{env, num::NonZeroUsize, sync::Arc};
use camino::{Utf8Path, Utf8PathBuf};
use camino_tempfile::Utf8TempDir;
use remote_storage::{
DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
S3Config,
};
use tokio_util::sync::CancellationToken;
use utils::{
lsn::Lsn,
shard::{ShardCount, ShardNumber},
};
use wal_decoder::models::InterpretedWalRecord;
const S3_BUCKET: &str = "neon-github-public-dev";
const S3_REGION: &str = "eu-central-1";
const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
const METADATA_FILENAME: &str = "metadata.json";
/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
/// This mirrors the configuration in bin/safekeeper.rs.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
let remote_storage_config = RemoteStorageConfig {
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: S3_BUCKET.to_string(),
bucket_region: S3_REGION.to_string(),
prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: None,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}
async fn download_bench_data(
client: Arc<GenericRemoteStorage>,
cancel: &CancellationToken,
) -> anyhow::Result<Utf8TempDir> {
let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
eprintln!("Downloading benchmark data to {:?}", temp_dir);
let listing = client
.list(None, ListingMode::NoDelimiter, None, cancel)
.await?;
let mut downloads = listing
.keys
.into_iter()
.map(|obj| {
let client = client.clone();
let temp_dir_path = temp_dir.path().to_owned();
async move {
let remote_path = obj.key;
let download = client
.download(&remote_path, &DownloadOpts::default(), cancel)
.await?;
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let file_name = remote_path.object_name().unwrap();
let file_path = temp_dir_path.join(file_name);
let file = tokio::fs::OpenOptions::new()
.create(true)
.truncate(true)
.write(true)
.open(&file_path)
.await?;
let mut writer = tokio::io::BufWriter::new(file);
tokio::io::copy_buf(&mut body, &mut writer).await?;
Ok::<(), anyhow::Error>(())
}
})
.collect::<FuturesUnordered<_>>();
while let Some(download) = downloads.next().await {
download?;
}
Ok(temp_dir)
}
struct BenchmarkData {
wal: Vec<u8>,
meta: BenchmarkMetadata,
}
#[derive(Deserialize)]
struct BenchmarkMetadata {
pg_version: u32,
start_lsn: Lsn,
}
async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
eprintln!("Loading benchmark data from {:?}", path);
let mut entries = tokio::fs::read_dir(path).await?;
let mut ordered_segment_paths = Vec::new();
let mut metadata = None;
while let Some(entry) = entries.next_entry().await? {
if entry.file_name() == METADATA_FILENAME {
let bytes = tokio::fs::read(entry.path()).await?;
metadata = Some(
serde_json::from_slice::<BenchmarkMetadata>(&bytes)
.context("failed to deserialize metadata.json")?,
);
} else {
ordered_segment_paths.push(entry.path());
}
}
ordered_segment_paths.sort();
let mut buffer = Vec::new();
for path in ordered_segment_paths {
if buffer.len() >= input_size {
break;
}
use async_compression::tokio::bufread::ZstdDecoder;
let file = tokio::fs::File::open(path).await?;
let reader = tokio::io::BufReader::new(file);
let decoder = ZstdDecoder::new(reader);
let mut reader = tokio::io::BufReader::new(decoder);
tokio::io::copy_buf(&mut reader, &mut buffer).await?;
}
buffer.truncate(input_size);
Ok(BenchmarkData {
wal: buffer,
meta: metadata.unwrap(),
})
}
fn criterion_benchmark(c: &mut Criterion) {
const INPUT_SIZE: usize = 128 * 1024 * 1024;
let setup_runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
let cancel = CancellationToken::new();
let client = create_s3_client().await.unwrap();
let temp_dir = download_bench_data(client, &cancel).await.unwrap();
let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
(temp_dir, bench_data)
});
eprintln!(
"Benchmarking against {} MiB of WAL",
INPUT_SIZE / 1024 / 1024
);
let mut group = c.benchmark_group("decode-interpret-wal");
group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
group.sample_size(10);
group.bench_function("unsharded", |b| {
b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
});
let eight_shards = (0..8)
.map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
.collect::<Vec<_>>();
group.bench_function("8/8-shards", |b| {
b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
});
let four_shards = eight_shards
.into_iter()
.filter(|s| s.number.0 % 2 == 0)
.collect::<Vec<_>>();
group.bench_function("4/8-shards", |b| {
b.iter(|| decode_interpret_main(&bench_data, &four_shards))
});
let two_shards = four_shards
.into_iter()
.filter(|s| s.number.0 % 4 == 0)
.collect::<Vec<_>>();
group.bench_function("2/8-shards", |b| {
b.iter(|| decode_interpret_main(&bench_data, &two_shards))
});
}
fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
let r = decode_interpret(bench, shards);
if let Err(e) = r {
panic!("{e:?}");
}
}
fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
decoder.feed_bytes(chunk);
while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
assert!(lsn.is_aligned());
let _ = InterpretedWalRecord::from_bytes_filtered(
recdata,
shard,
lsn,
bench.meta.pg_version,
)
.unwrap();
}
}
Ok(())
}
criterion_group!(
name=benches;
config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets=criterion_benchmark
);
criterion_main!(benches);

View File

@@ -1,6 +1,8 @@
//! This module contains logic for decoding and interpreting
//! raw bytes which represent a raw Postgres WAL record.
use std::collections::HashMap;
use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
@@ -14,15 +16,15 @@ use utils::lsn::Lsn;
impl InterpretedWalRecord {
/// Decode and interpreted raw bytes which represent one Postgres WAL record.
/// Data blocks which do not match the provided shard identity are filtered out.
/// Data blocks which do not match any of the provided shard identities are filtered out.
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
/// the keys that are being written as that is enough for updating relation sizes.
pub fn from_bytes_filtered(
buf: Bytes,
shard: &ShardIdentity,
shards: &[ShardIdentity],
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<InterpretedWalRecord> {
) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
let mut decoded = DecodedWALRecord::default();
decode_wal_record(buf, &mut decoded, pg_version)?;
let xid = decoded.xl_xid;
@@ -33,43 +35,57 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};
let metadata_record =
MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
HashMap::with_capacity(shards.len());
for shard in shards {
shard_records.insert(
*shard,
InterpretedWalRecord {
metadata_record: None,
batch: SerializedValueBatch::default(),
next_record_lsn,
flush_uncommitted,
xid,
},
);
}
MetadataRecord::from_decoded_filtered(
&decoded,
&mut shard_records,
next_record_lsn,
pg_version,
)?;
SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
&mut shard_records,
next_record_lsn,
pg_version,
)?;
Ok(InterpretedWalRecord {
metadata_record,
batch,
next_record_lsn,
flush_uncommitted,
xid,
})
Ok(shard_records)
}
}
impl MetadataRecord {
/// Builds a metadata record for this WAL record, if any.
/// Populates the given `shard_records` with metadata records from this WAL record, if any,
/// discarding those belonging to other shards.
///
/// Only metadata records relevant for the given shard are emitted. Currently, most metadata
/// Only metadata records relevant for the given shards is emitted. Currently, most metadata
/// records are broadcast to all shards for simplicity, but this should be improved.
fn from_decoded_filtered(
decoded: &DecodedWALRecord,
shard: &ShardIdentity,
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
) -> anyhow::Result<()> {
// Note: this doesn't actually copy the bytes since
// the [`Bytes`] type implements it via a level of indirection.
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
// First, generate metadata records from the decoded WAL record.
let mut metadata_record = match decoded.xl_rmid {
let metadata_record = match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
Self::decode_heapam_record(&mut buf, decoded, pg_version)?
}
@@ -112,41 +128,65 @@ impl MetadataRecord {
};
// Next, filter the metadata record by shard.
match metadata_record {
Some(
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
) => {
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
// of the main relation. These are sharded and managed just like regular relation pages.
// See: https://github.com/neondatabase/neon/issues/9855
let is_local_vm_page = |heap_blk| {
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
};
// Send the old and new VM page updates to their respective shards.
clear_vm_bits.old_heap_blkno = clear_vm_bits
.old_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
clear_vm_bits.new_heap_blkno = clear_vm_bits
.new_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
// If neither VM page belongs to this shard, discard the record.
if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none()
{
metadata_record = None
for (shard, record) in shard_records.iter_mut() {
match metadata_record {
Some(
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits))
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)),
) => {
// Route VM page updates to the shards that own them. VM pages are stored in the VM fork
// of the main relation. These are sharded and managed just like regular relation pages.
// See: https://github.com/neondatabase/neon/issues/9855
let is_local_vm_page = |heap_blk| {
let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
};
// Send the old and new VM page updates to their respective shards.
let updated_old_heap_blkno = clear_vm_bits
.old_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
let updated_new_heap_blkno = clear_vm_bits
.new_heap_blkno
.filter(|&blkno| is_local_vm_page(blkno));
// If neither VM page belongs to this shard, discard the record.
if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() {
// Clone the record and update it for the current shard.
let mut for_shard = metadata_record.clone();
match for_shard {
Some(
MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
ref mut clear_vm_bits,
))
| MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
ref mut clear_vm_bits,
)),
) => {
clear_vm_bits.old_heap_blkno = updated_old_heap_blkno;
clear_vm_bits.new_heap_blkno = updated_new_heap_blkno;
record.metadata_record = for_shard;
}
_ => {
unreachable!("for_shard is a clone of what we checked above")
}
}
}
}
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
if shard.is_shard_zero() {
record.metadata_record = metadata_record;
// No other shards should receive this record, so we stop traversing shards early.
break;
}
}
_ => {
// All other metadata records are sent to all shards.
record.metadata_record = metadata_record.clone();
}
}
Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => {
// Filter LogicalMessage records (AUX files) to only be stored on shard zero
if !shard.is_shard_zero() {
metadata_record = None;
}
}
_ => {}
}
Ok(metadata_record)
Ok(())
}
fn decode_heapam_record(

View File

@@ -48,7 +48,7 @@ pub mod proto {
tonic::include_proto!("interpreted_wal");
}
#[derive(Serialize, Deserialize)]
#[derive(Copy, Clone, Serialize, Deserialize)]
pub enum FlushUncommittedRecords {
Yes,
No,
@@ -64,7 +64,7 @@ pub struct InterpretedWalRecords {
}
/// An interpreted Postgres WAL record, ready to be handled by the pageserver
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct InterpretedWalRecord {
/// Optional metadata record - may cause writes to metadata keys
/// in the storage engine
@@ -107,7 +107,7 @@ impl InterpretedWalRecord {
/// The interpreted part of the Postgres WAL record which requires metadata
/// writes to the underlying storage engine.
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum MetadataRecord {
Heapam(HeapamRecord),
Neonrmgr(NeonrmgrRecord),
@@ -123,12 +123,12 @@ pub enum MetadataRecord {
Replorigin(ReploriginRecord),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum HeapamRecord {
ClearVmBits(ClearVmBits),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct ClearVmBits {
pub new_heap_blkno: Option<u32>,
pub old_heap_blkno: Option<u32>,
@@ -136,29 +136,29 @@ pub struct ClearVmBits {
pub flags: u8,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum NeonrmgrRecord {
ClearVmBits(ClearVmBits),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum SmgrRecord {
Create(SmgrCreate),
Truncate(XlSmgrTruncate),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct SmgrCreate {
pub rel: RelTag,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum DbaseRecord {
Create(DbaseCreate),
Drop(DbaseDrop),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct DbaseCreate {
pub db_id: Oid,
pub tablespace_id: Oid,
@@ -166,32 +166,32 @@ pub struct DbaseCreate {
pub src_tablespace_id: Oid,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct DbaseDrop {
pub db_id: Oid,
pub tablespace_ids: Vec<Oid>,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum ClogRecord {
ZeroPage(ClogZeroPage),
Truncate(ClogTruncate),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct ClogZeroPage {
pub segno: u32,
pub rpageno: u32,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct ClogTruncate {
pub pageno: u32,
pub oldest_xid: TransactionId,
pub oldest_xid_db: Oid,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum XactRecord {
Commit(XactCommon),
Abort(XactCommon),
@@ -200,7 +200,7 @@ pub enum XactRecord {
Prepare(XactPrepare),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct XactCommon {
pub parsed: XlXactParsedRecord,
pub origin_id: u16,
@@ -209,73 +209,73 @@ pub struct XactCommon {
pub lsn: Lsn,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct XactPrepare {
pub xl_xid: TransactionId,
pub data: Bytes,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum MultiXactRecord {
ZeroPage(MultiXactZeroPage),
Create(XlMultiXactCreate),
Truncate(XlMultiXactTruncate),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct MultiXactZeroPage {
pub slru_kind: SlruKind,
pub segno: u32,
pub rpageno: u32,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum RelmapRecord {
Update(RelmapUpdate),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct RelmapUpdate {
pub update: XlRelmapUpdate,
pub buf: Bytes,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum XlogRecord {
Raw(RawXlogRecord),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct RawXlogRecord {
pub info: u8,
pub lsn: Lsn,
pub buf: Bytes,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum LogicalMessageRecord {
Put(PutLogicalMessage),
#[cfg(feature = "testing")]
Failpoint,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PutLogicalMessage {
pub path: String,
pub buf: Bytes,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum StandbyRecord {
RunningXacts(StandbyRunningXacts),
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub struct StandbyRunningXacts {
pub oldest_running_xid: TransactionId,
}
#[derive(Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize)]
pub enum ReploriginRecord {
Set(XlReploriginSet),
Drop(XlReploriginDrop),

View File

@@ -5,7 +5,7 @@
//! Such batches are created from decoded PG wal records and ingested
//! by the pageserver by writing directly to the ephemeral file.
use std::collections::BTreeSet;
use std::collections::{BTreeSet, HashMap};
use bytes::{Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
@@ -22,6 +22,8 @@ use utils::lsn::Lsn;
use pageserver_api::key::Key;
use crate::models::InterpretedWalRecord;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// Accompanying metadata for the batch
@@ -30,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub enum ValueMeta {
Serialized(SerializedValueMeta),
Observed(ObservedValueMeta),
@@ -77,7 +79,7 @@ impl PartialEq for OrderedValueMeta {
impl Eq for OrderedValueMeta {}
/// Metadata for a [`Value`] serialized into the batch.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct SerializedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
@@ -89,14 +91,14 @@ pub struct SerializedValueMeta {
}
/// Metadata for a [`Value`] observed by the batch
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct ObservedValueMeta {
pub key: CompactKey,
pub lsn: Lsn,
}
/// Batch of serialized [`Value`]s.
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Clone)]
pub struct SerializedValueBatch {
/// [`Value`]s serialized in EphemeralFile's native format,
/// ready for disk write by the pageserver
@@ -128,7 +130,8 @@ impl Default for SerializedValueBatch {
}
impl SerializedValueBatch {
/// Build a batch of serialized values from a decoded PG WAL record
/// Populates the given `shard_records` with value batches from this WAL record, if any,
/// discarding those belonging to other shards.
///
/// The batch will only contain values for keys targeting the specifiec
/// shard. Shard 0 is a special case, where any keys that don't belong to
@@ -136,21 +139,20 @@ impl SerializedValueBatch {
/// but absent from the raw buffer [`SerializedValueBatch::raw`]).
pub(crate) fn from_decoded_filtered(
decoded: DecodedWALRecord,
shard: &ShardIdentity,
shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<SerializedValueBatch> {
// First determine how big the buffer needs to be and allocate it up-front.
) -> anyhow::Result<()> {
// First determine how big the buffers need to be and allocate it up-front.
// This duplicates some of the work below, but it's empirically much faster.
let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);
for (shard, record) in shard_records.iter_mut() {
assert!(record.batch.is_empty());
let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version);
record.batch.raw = Vec::with_capacity(estimate);
}
let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
let mut max_lsn: Lsn = Lsn(0);
let mut len: usize = 0;
for blk in decoded.blocks.iter() {
let relative_off = buf.len() as u64;
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
@@ -168,99 +170,98 @@ impl SerializedValueBatch {
);
}
let key_is_local = shard.is_key_local(&key);
for (shard, record) in shard_records.iter_mut() {
let key_is_local = shard.is_key_local(&key);
tracing::debug!(
lsn=%next_record_lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
tracing::debug!(
lsn=%next_record_lsn,
key=%key,
"ingest: shard decision {}",
if !key_is_local { "drop" } else { "keep" },
);
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
metadata.push(ValueMeta::Observed(ObservedValueMeta {
if !key_is_local {
if shard.is_shard_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
record
.batch
.metadata
.push(ValueMeta::Observed(ObservedValueMeta {
key: key.to_compact(),
lsn: next_record_lsn,
}))
}
continue;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
let val = if Self::block_is_image(&decoded, blk, pg_version) {
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, next_record_lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
let relative_off = record.batch.raw.len() as u64;
val.ser_into(&mut record.batch.raw)
.expect("Writing into in-memory buffer is infallible");
let val_ser_size = record.batch.raw.len() - relative_off as usize;
record
.batch
.metadata
.push(ValueMeta::Serialized(SerializedValueMeta {
key: key.to_compact(),
lsn: next_record_lsn,
}))
}
continue;
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn);
record.batch.len += 1;
}
// Instead of storing full-page-image WAL record,
// it is better to store extracted image: we can skip wal-redo
// in this case. Also some FPI records may contain multiple (up to 32) pages,
// so them have to be copied multiple times.
//
let val = if Self::block_is_image(&decoded, blk, pg_version) {
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
let img_offs = blk.bimg_offset as usize;
let mut image = BytesMut::with_capacity(BLCKSZ as usize);
// TODO(vlad): skip the copy
image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);
if blk.hole_length != 0 {
let tail = image.split_off(blk.hole_offset as usize);
image.resize(image.len() + blk.hole_length as usize, 0u8);
image.unsplit(tail);
}
//
// Match the logic of XLogReadBufferForRedoExtended:
// The page may be uninitialized. If so, we can't set the LSN because
// that would corrupt the page.
//
if !page_is_new(&image) {
page_set_lsn(&mut image, next_record_lsn)
}
assert_eq!(image.len(), BLCKSZ as usize);
Value::Image(image.freeze())
} else {
Value::WalRecord(NeonWalRecord::Postgres {
will_init: blk.will_init || blk.apply_image,
rec: decoded.record.clone(),
})
};
val.ser_into(&mut buf)
.expect("Writing into in-memory buffer is infallible");
let val_ser_size = buf.len() - relative_off as usize;
metadata.push(ValueMeta::Serialized(SerializedValueMeta {
key: key.to_compact(),
lsn: next_record_lsn,
batch_offset: relative_off,
len: val_ser_size,
will_init: val.will_init(),
}));
max_lsn = std::cmp::max(max_lsn, next_record_lsn);
len += 1;
}
if cfg!(any(debug_assertions, test)) {
let batch = Self {
raw: buf,
metadata,
max_lsn,
len,
};
batch.validate_lsn_order();
return Ok(batch);
// Validate that the batches are correct
for record in shard_records.values() {
record.batch.validate_lsn_order();
}
}
Ok(Self {
raw: buf,
metadata,
max_lsn,
len,
})
Ok(())
}
/// Look into the decoded PG WAL record and determine