pageserver: batch InMemoryLayer puts, remove need to sort items by LSN during ingest (#8591)

## Problem/Solution TimelineWriter::put_batch is simply a loop over individual puts. Each put acquires and releases locks, and checks for potentially starting a new layer. Batching these is more efficient, but more importantly unlocks future changes where we can pre-build serialized buffers much earlier in the ingest process, potentially even on the safekeeper (imagine a future model where some variant of DatadirModification lives on the safekeeper). Ensuring that the values in put_batch are written to one layer also enables a simplification upstream, where we no longer need to write values in LSN-order. This saves us a sort, but also simplifies follow-on refactors to DatadirModification: we can store metadata keys and data keys separately at that level without needing to zip them together in LSN order later. ## Why? In this PR, these changes are simplify optimizations, but they are motivated by evolving the ingest path in the direction of disentangling extracting DatadirModification from Timeline. It may not obvious how right now, but the general idea is that we'll end up with three phases of ingest: - A) Decode walrecords and build a datadirmodification with all the simple data contents already in a big serialized buffer ready to write to an ephemeral layer **<-- this part can be pipelined and parallelized, and done on a safekeeper!** - B) Let that datadirmodification see a Timeline, so that it can also generate all the metadata updates that require a read-modify-write of existing pages - C) Dump the results of B into an ephemeral layer. Related: https://github.com/neondatabase/neon/issues/8452 ## Caveats Doing a big monolithic buffer of values to write to disk is ordinarily an anti-pattern: we prefer nice streaming I/O. However: - In future, when we do this first decode stage on the safekeeper, it would be inefficient to serialize a Vec of Value, and then later deserialize it just to add blob size headers while writing into the ephemeral layer format. The idea is that for bulk write data, we will serialize exactly once. - The monolithic buffer is a stepping stone to pipelining more of this: by seriailizing earlier (rather than at the final put_value), we will be able to parallelize the wal decoding and bulk serialization of data page writes. - The ephemeral layer's buffered writer already stalls writes while it waits to flush: so while yes we'll stall for a couple milliseconds to write a couple megabytes, we already have stalls like this, just distributed across smaller writes. ## Benchmarks This PR is primarily a stepping stone to safekeeper ingest filtering, but also provides a modest efficiency improvement to the `wal_recovery` part of `test_bulk_ingest`. test_bulk_ingest: ``` test_bulk_insert[neon-release-pg16].insert: 23.659 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 626 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 18.981 s test_bulk_insert[neon-release-pg16].compaction: 0.055 s vs. tip of main: test_bulk_insert[neon-release-pg16].insert: 24.001 s test_bulk_insert[neon-release-pg16].pageserver_writes: 5,428 MB test_bulk_insert[neon-release-pg16].peak_mem: 604 MB test_bulk_insert[neon-release-pg16].size: 0 MB test_bulk_insert[neon-release-pg16].data_uploaded: 1,922 MB test_bulk_insert[neon-release-pg16].num_files_uploaded: 8 test_bulk_insert[neon-release-pg16].wal_written: 1,382 MB test_bulk_insert[neon-release-pg16].wal_recovery: 23.586 s test_bulk_insert[neon-release-pg16].compaction: 0.054 s ```
2025-12-25 23:29:59 +00:00 · 2024-08-22 11:04:42 +01:00
parent a968554a8c
commit 7c74112b2a
7 changed files with 247 additions and 120 deletions
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -15,12 +15,11 @@ use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
@@ -37,7 +36,6 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
-use utils::vec_map::{VecMap, VecMapOrdering};
 use utils::{bin_ser::BeSer, lsn::Lsn};

 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
@@ -174,6 +172,7 @@ impl Timeline {
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
            pending_directory_entries: Vec::new(),
+            pending_bytes: 0,
            lsn,
        }
    }
@@ -1022,21 +1021,33 @@ pub struct DatadirModification<'a> {
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
    pending_lsns: Vec<Lsn>,
-    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_updates: HashMap<Key, Vec<(Lsn, usize, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+
+    /// An **approximation** of how large our EphemeralFile write will be when committed.
+    pending_bytes: usize,
 }

 impl<'a> DatadirModification<'a> {
+    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
+    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
+    // additionally specify a limit on how much payload a DatadirModification may contain before it should be committed.
+    pub(crate) const MAX_PENDING_BYTES: usize = 8 * 1024 * 1024;
+
    /// Get the current lsn
    pub(crate) fn get_lsn(&self) -> Lsn {
        self.lsn
    }

+    pub(crate) fn approx_pending_bytes(&self) -> usize {
+        self.pending_bytes
+    }
+
    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
        ensure!(
@@ -1769,21 +1780,25 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
        for (key, values) in self.pending_updates.drain() {
-            for (lsn, value) in values {
+            let mut write_batch = Vec::new();
+            for (lsn, value_ser_size, value) in values {
                if key.is_rel_block_key() || key.is_slru_block_key() {
                    // This bails out on first error without modifying pending_updates.
                    // That's Ok, cf this function's doc comment.
-                    writer.put(key, lsn, &value, ctx).await?;
+                    write_batch.push((key.to_compact(), lsn, value_ser_size, value));
                } else {
-                    retained_pending_updates
-                        .entry(key)
-                        .or_default()
-                        .push((lsn, value));
+                    retained_pending_updates.entry(key).or_default().push((
+                        lsn,
+                        value_ser_size,
+                        value,
+                    ));
                }
            }
+            writer.put_batch(write_batch, ctx).await?;
        }

        self.pending_updates = retained_pending_updates;
+        self.pending_bytes = 0;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1809,17 +1824,20 @@ impl<'a> DatadirModification<'a> {
        self.pending_nblocks = 0;

        if !self.pending_updates.is_empty() {
-            // The put_batch call below expects expects the inputs to be sorted by Lsn,
-            // so we do that first.
-            let lsn_ordered_batch: VecMap<Lsn, (Key, Value)> = VecMap::from_iter(
-                self.pending_updates
-                    .drain()
-                    .map(|(key, vals)| vals.into_iter().map(move |(lsn, val)| (lsn, (key, val))))
-                    .kmerge_by(|lhs, rhs| lhs.0 < rhs.0),
-                VecMapOrdering::GreaterOrEqual,
-            );
+            // Ordering: the items in this batch do not need to be in any global order, but values for
+            // a particular Key must be in Lsn order relative to one another.  InMemoryLayer relies on
+            // this to do efficient updates to its index.
+            let batch: Vec<(CompactKey, Lsn, usize, Value)> = self
+                .pending_updates
+                .drain()
+                .flat_map(|(key, values)| {
+                    values.into_iter().map(move |(lsn, val_ser_size, value)| {
+                        (key.to_compact(), lsn, val_ser_size, value)
+                    })
+                })
+                .collect::<Vec<_>>();

-            writer.put_batch(lsn_ordered_batch, ctx).await?;
+            writer.put_batch(batch, ctx).await?;
        }

        if !self.pending_deletions.is_empty() {
@@ -1844,6 +1862,8 @@ impl<'a> DatadirModification<'a> {
            writer.update_directory_entries_count(kind, count as u64);
        }

+        self.pending_bytes = 0;
+
        Ok(())
    }

@@ -1860,7 +1880,7 @@ impl<'a> DatadirModification<'a> {
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
        if let Some(values) = self.pending_updates.get(&key) {
-            if let Some((_, value)) = values.last() {
+            if let Some((_, _, value)) = values.last() {
                return if let Value::Image(img) = value {
                    Ok(img.clone())
                } else {
@@ -1888,13 +1908,17 @@ impl<'a> DatadirModification<'a> {
    fn put(&mut self, key: Key, val: Value) {
        let values = self.pending_updates.entry(key).or_default();
        // Replace the previous value if it exists at the same lsn
-        if let Some((last_lsn, last_value)) = values.last_mut() {
+        if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
            if *last_lsn == self.lsn {
+                *last_value_ser_size = val.serialized_size().unwrap() as usize;
                *last_value = val;
                return;
            }
        }
-        values.push((self.lsn, val));
+
+        let val_serialized_size = val.serialized_size().unwrap() as usize;
+        self.pending_bytes += val_serialized_size;
+        values.push((self.lsn, val_serialized_size, val));
    }

    fn delete(&mut self, key_range: Range<Key>) {