WIP: refactor for unit testing

https://github.com/neondatabase/neon/pull/8537#discussion_r1719877858
https://github.com/neondatabase/neon/pull/8537#discussion_r1719830114
2026-05-19 06:00:38 +00:00 · 2024-08-19 16:56:24 +00:00 · 2024-08-19 15:26:56 +00:00 · 2024-08-19 15:22:33 +00:00 · 2024-08-15 12:51:04 +00:00 · 2024-08-15 11:46:24 +00:00
11 changed files with 547 additions and 827 deletions
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -67,7 +67,8 @@ async fn ingest(
    let layer =
        InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;

-    let data = Value::Image(Bytes::from(vec![0u8; put_size])).ser()?;
+    let value = Value::Image(Bytes::from(vec![0u8; put_size]));
+    let data = value.ser()?;
    let ctx = RequestContext::new(
        pageserver::task_mgr::TaskKind::WalReceiverConnectionHandler,
        pageserver::context::DownloadBehavior::Download,
@@ -95,7 +96,9 @@ async fn ingest(
            }
        }

-        layer.put_value(key.to_compact(), lsn, &data, &ctx).await?;
+        layer
+            .put_value(key.to_compact(), lsn, &data, value.will_init(), &ctx)
+            .await?;
    }
    layer.freeze(lsn + 1).await;

--- a/pageserver/src/l0_flush.rs
+++ b/pageserver/src/l0_flush.rs
@@ -1,15 +1,10 @@
 use std::{num::NonZeroUsize, sync::Arc};

-use crate::tenant::ephemeral_file;
-
 #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
 pub enum L0FlushConfig {
-    PageCached,
    #[serde(rename_all = "snake_case")]
-    Direct {
-        max_concurrency: NonZeroUsize,
-    },
+    Direct { max_concurrency: NonZeroUsize },
 }

 impl Default for L0FlushConfig {
@@ -25,14 +20,12 @@ impl Default for L0FlushConfig {
 pub struct L0FlushGlobalState(Arc<Inner>);

 pub enum Inner {
-    PageCached,
    Direct { semaphore: tokio::sync::Semaphore },
 }

 impl L0FlushGlobalState {
    pub fn new(config: L0FlushConfig) -> Self {
        match config {
-            L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)),
            L0FlushConfig::Direct { max_concurrency } => {
                let semaphore = tokio::sync::Semaphore::new(max_concurrency.get());
                Self(Arc::new(Inner::Direct { semaphore }))
@@ -44,13 +37,3 @@ impl L0FlushGlobalState {
        &self.0
    }
 }
-
-impl L0FlushConfig {
-    pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite {
-        use L0FlushConfig::*;
-        match self {
-            PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes,
-            Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No,
-        }
-    }
-}
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,7 +2,6 @@
 //! Low-level Block-oriented I/O functions
 //!

-use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
    FileBlockReader(&'a FileBlockReader<'a>),
-    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
-    Slice(&'a [u8]),
    #[cfg(test)]
    TestDisk(&'a super::disk_btree::tests::TestDisk),
    #[cfg(test)]
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
        use BlockReaderRef::*;
        match self {
            FileBlockReader(r) => r.read_blk(blknum, ctx).await,
-            EphemeralFile(r) => r.read_blk(blknum, ctx).await,
            Adapter(r) => r.read_blk(blknum, ctx).await,
-            Slice(s) => Self::read_blk_slice(s, blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
            #[cfg(test)]
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
    }
 }

-impl<'a> BlockReaderRef<'a> {
-    fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
-        let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
-        let end = start.checked_add(PAGE_SZ).unwrap();
-        if end > slice.len() {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::UnexpectedEof,
-                format!("slice too short, len={} end={}", slice.len(), end),
-            ));
-        }
-        let slice = &slice[start..end];
-        let page_sized: &[u8; PAGE_SZ] = slice
-            .try_into()
-            .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
-        Ok(BlockLease::Slice(page_sized))
-    }
-}
-
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -4,10 +4,16 @@
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
-use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
-use crate::virtual_file::{self, VirtualFile};
+use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
+use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
+use crate::virtual_file::owned_buffers_io::write::Buffer;
+use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
+use anyhow::Context;
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+use tracing::error;

 use std::io;
 use std::sync::atomic::AtomicU64;
@@ -16,13 +22,19 @@ use utils::id::TimelineId;
 pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
-
-    rw: page_caching::RW,
+    page_cache_file_id: page_cache::FileId,
+    bytes_written: u32,
+    buffered_writer: owned_buffers_io::write::BufferedWriter<
+        BytesMut,
+        size_tracking_writer::Writer<VirtualFile>,
+    >,
+    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
+    _gate_guard: utils::sync::gate::GateGuard,
 }

-mod page_caching;
-pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite;
-mod zero_padded_read_write;
+use super::storage_layer::inmemory_layer::InMemoryLayerIndexValue;
+
+const TAIL_SZ: usize = 64 * 1024;

 impl EphemeralFile {
    pub async fn create(
@@ -52,59 +64,199 @@ impl EphemeralFile {
        )
        .await?;

-        let prewarm = conf.l0_flush.prewarm_on_write();
+        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore

        Ok(EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
-            rw: page_caching::RW::new(file, prewarm, gate_guard),
+            page_cache_file_id,
+            bytes_written: 0,
+            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
+                size_tracking_writer::Writer::new(file),
+                BytesMut::with_capacity(TAIL_SZ),
+            ),
+            _gate_guard: gate_guard,
        })
    }
+}

-    pub(crate) fn len(&self) -> u64 {
-        self.rw.bytes_written()
+impl Drop for EphemeralFile {
+    fn drop(&mut self) {
+        // unlink the file
+        // we are clear to do this, because we have entered a gate
+        let path = &self.buffered_writer.as_inner().as_inner().path;
+        let res = std::fs::remove_file(path);
+        if let Err(e) = res {
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!("could not remove ephemeral file '{path}': {e}");
+            }
+        }
+    }
+}
+
+impl EphemeralFile {
+    pub(crate) fn len(&self) -> u32 {
+        self.bytes_written
    }

    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.rw.page_cache_file_id()
+        self.page_cache_file_id
    }

-    /// See [`self::page_caching::RW::load_to_vec`].
    pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        self.rw.load_to_vec(ctx).await
+        let size = usize::try_from(self.len()).unwrap();
+        let vec = Vec::with_capacity(size);
+
+        // read from disk what we've already flushed
+        let file_size_tracker = self.buffered_writer.as_inner();
+        let flushed_offset = usize::try_from(file_size_tracker.bytes_written()).unwrap();
+        let flushed_range = 0..flushed_offset;
+        let file: &VirtualFile = file_size_tracker.as_inner();
+        let mut vec = file
+            .read_exact_at(
+                vec.slice(0..(flushed_range.end - flushed_range.start)),
+                u64::try_from(flushed_range.start).unwrap(),
+                ctx,
+            )
+            .await?
+            .into_inner();
+
+        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
+        let buffer = self.buffered_writer.inspect_buffer();
+        let buffered = &buffer[0..buffer.pending()];
+        vec.extend_from_slice(buffered);
+        assert_eq!(vec.len(), size);
+        Ok(vec)
    }

-    pub(crate) async fn read_blk(
+    /// Fill dst will dst.bytes_total() bytes from the bytes written to the buffered writer from offset `start` and later.
+    /// If `dst` is larger than the available bytes, the read will be short.
+    /// The read will never be short for other reasons.
+    /// The number of bytes read into `dst` is returned as part of the result tuple.
+    /// No guarantees are made about the remaining bytes in `dst`, i.e., assume their contents are random.
+    pub(crate) async fn read_at_to_end<B: IoBufMut + Send>(
        &self,
-        blknum: u32,
+        start: u32,
+        dst: Slice<B>,
        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        self.rw.read_blk(blknum, ctx).await
+    ) -> std::io::Result<(Slice<B>, usize)> {
+        let file_size_tracking_writer = self.buffered_writer.as_inner();
+        let flushed_offset = u32::try_from(file_size_tracking_writer.bytes_written())
+            .expect("we don't allow writing more than u32::MAX bytes");
+
+        let buffer = self.buffered_writer.inspect_buffer();
+        let buffered = &buffer[0..buffer.pending()];
+
+        let dst_cap = u32::try_from(dst.bytes_total())
+            .with_context(|| {
+                format!(
+                    "read_aligned: dst.bytes_total() is too large: {}",
+                    dst.len()
+                )
+            })
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+        let end = {
+            let mut end = start
+                .checked_add(dst_cap)
+                .with_context(|| {
+                    format!("read_aligned: offset + dst.bytes_total() is too large: {start} + {dst_cap}",)
+                })
+                .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+            if end > self.bytes_written {
+                end = self.bytes_written;
+            }
+            end
+        };
+
+        // inclusive, exclusive
+        #[derive(Debug)]
+        struct Range(u32, u32);
+        impl Range {
+            fn len(&self) -> u32 {
+                if self.0 > self.1 {
+                    0
+                } else {
+                    self.1 - self.0
+                }
+            }
+        }
+        let written_range = Range(start, std::cmp::min(end, flushed_offset));
+        let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
+
+        let dst = if written_range.len() > 0 {
+            let file: &VirtualFile = file_size_tracking_writer.as_inner();
+            let bounds = dst.bounds();
+            let slice = file
+                .read_exact_at(
+                    dst.slice(0..written_range.len() as usize),
+                    start as u64,
+                    ctx,
+                )
+                .await?;
+            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
+        } else {
+            dst
+        };
+
+        let dst = if buffered_range.len() > 0 {
+            let offset_in_buffer =
+                usize::try_from(buffered_range.0.checked_sub(flushed_offset).unwrap()).unwrap();
+            let to_copy =
+                &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len() as usize)];
+            let bounds = dst.bounds();
+            let mut view = dst.slice(
+                written_range.len() as usize
+                    ..written_range.len() as usize + buffered_range.len() as usize,
+            );
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+        } else {
+            dst
+        };
+
+        // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
+
+        Ok((dst, (end - start) as usize))
    }

    pub(crate) async fn write_blob(
        &mut self,
-        srcbuf: &[u8],
+        buf: &[u8],
+        will_init: bool,
        ctx: &RequestContext,
-    ) -> Result<u64, io::Error> {
-        let pos = self.rw.bytes_written();
+    ) -> Result<InMemoryLayerIndexValue, io::Error> {
+        let pos = self.bytes_written;
+        let len = u32::try_from(buf.len()).map_err(|e| {
+            std::io::Error::new(
+                std::io::ErrorKind::Other,
+                anyhow::anyhow!(
+                    "EphemeralFile::write_blob value too large: {}: {e}",
+                    buf.len()
+                ),
+            )
+        })?;
+        pos.checked_add(len).ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "EphemeralFile::write_blob: overflow",
+            )
+        })?;

-        // Write the length field
-        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
+        self.buffered_writer
+            .write_buffered_borrowed(buf, ctx)
+            .await?;
+        self.bytes_written += len;

-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        } else {
-            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
-            len_buf[0] |= 0x80;
-            self.rw.write_all_borrowed(&len_buf, ctx).await?;
-        }
-
-        // Write the payload
-        self.rw.write_all_borrowed(srcbuf, ctx).await?;
-
-        Ok(pos)
+        Ok(InMemoryLayerIndexValue {
+            pos,
+            len,
+            will_init,
+        })
    }
 }

@@ -117,19 +269,11 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

-impl BlockReader for EphemeralFile {
-    fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
-        BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
-    use crate::tenant::block_io::BlockReaderRef;
-    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -160,69 +304,6 @@ mod tests {
        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }

-    #[tokio::test]
-    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
-
-        let gate = utils::sync::gate::Gate::default();
-
-        let entered = gate.enter().unwrap();
-
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
-
-        let pos_foo = file.write_blob(b"foo", &ctx).await?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor()
-                .read_blob(pos_foo, &ctx)
-                .await?
-                .as_slice()
-        );
-        let pos_bar = file.write_blob(b"bar", &ctx).await?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor()
-                .read_blob(pos_foo, &ctx)
-                .await?
-                .as_slice()
-        );
-        assert_eq!(
-            b"bar",
-            file.block_cursor()
-                .read_blob(pos_bar, &ctx)
-                .await?
-                .as_slice()
-        );
-
-        let mut blobs = Vec::new();
-        for i in 0..10000 {
-            let data = Vec::from(format!("blob{}", i).as_bytes());
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
-        }
-        // also test with a large blobs
-        for i in 0..100 {
-            let data = format!("blob{}", i).as_bytes().repeat(100);
-            let pos = file.write_blob(&data, &ctx).await?;
-            blobs.push((pos, data));
-        }
-
-        let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
-        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos, &ctx).await?;
-            assert_eq!(actual, expected);
-        }
-
-        // Test a large blob that spans multiple pages
-        let mut large_data = vec![0; 20000];
-        thread_rng().fill_bytes(&mut large_data);
-        let pos_large = file.write_blob(&large_data, &ctx).await?;
-        let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
-        assert_eq!(result, large_data);
-
-        Ok(())
-    }
-
    #[tokio::test]
    async fn ephemeral_file_holds_gate_open() {
        const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
--- a/pageserver/src/tenant/ephemeral_file/page_caching.rs
+++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs
@@ -1,290 +0,0 @@
-//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
-//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
-
-use crate::context::RequestContext;
-use crate::page_cache::{self, PAGE_SZ};
-use crate::tenant::block_io::BlockLease;
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
-use crate::virtual_file::VirtualFile;
-
-use once_cell::sync::Lazy;
-use std::io::{self, ErrorKind};
-use std::ops::{Deref, Range};
-use tokio_epoll_uring::BoundedBuf;
-use tracing::*;
-
-use super::zero_padded_read_write;
-
-/// See module-level comment.
-pub struct RW {
-    page_cache_file_id: page_cache::FileId,
-    rw: super::zero_padded_read_write::RW<PreWarmingWriter>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop).
-    _gate_guard: utils::sync::gate::GateGuard,
-}
-
-/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`],
-/// should we pre-warm the [`crate::page_cache`] with the contents?
-#[derive(Clone, Copy)]
-pub enum PrewarmOnWrite {
-    Yes,
-    No,
-}
-
-impl RW {
-    pub fn new(
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-        _gate_guard: utils::sync::gate::GateGuard,
-    ) -> Self {
-        let page_cache_file_id = page_cache::next_file_id();
-        Self {
-            page_cache_file_id,
-            rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new(
-                page_cache_file_id,
-                file,
-                prewarm_on_write,
-            )),
-            _gate_guard,
-        }
-    }
-
-    pub fn page_cache_file_id(&self) -> page_cache::FileId {
-        self.page_cache_file_id
-    }
-
-    pub(crate) async fn write_all_borrowed(
-        &mut self,
-        srcbuf: &[u8],
-        ctx: &RequestContext,
-    ) -> Result<usize, io::Error> {
-        // It doesn't make sense to proactively fill the page cache on the Pageserver write path
-        // because Compute is unlikely to access recently written data.
-        self.rw.write_all_borrowed(srcbuf, ctx).await
-    }
-
-    pub(crate) fn bytes_written(&self) -> u64 {
-        self.rw.bytes_written()
-    }
-
-    /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
-    ///
-    /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
-    /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
-    pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
-        // round up to the next PAGE_SZ multiple, required by blob_io
-        let size = {
-            let s = usize::try_from(self.bytes_written()).unwrap();
-            if s % PAGE_SZ == 0 {
-                s
-            } else {
-                s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
-            }
-        };
-        let vec = Vec::with_capacity(size);
-
-        // read from disk what we've already flushed
-        let writer = self.rw.as_writer();
-        let flushed_range = writer.written_range();
-        let mut vec = writer
-            .file
-            .read_exact_at(
-                vec.slice(0..(flushed_range.end - flushed_range.start)),
-                u64::try_from(flushed_range.start).unwrap(),
-                ctx,
-            )
-            .await?
-            .into_inner();
-
-        // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
-        let buffered = self.rw.get_tail_zero_padded();
-        vec.extend_from_slice(buffered);
-        assert_eq!(vec.len(), size);
-        assert_eq!(vec.len() % PAGE_SZ, 0);
-        Ok(vec)
-    }
-
-    pub(crate) async fn read_blk(
-        &self,
-        blknum: u32,
-        ctx: &RequestContext,
-    ) -> Result<BlockLease, io::Error> {
-        match self.rw.read_blk(blknum).await? {
-            zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
-                let cache = page_cache::get();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.rw.as_writer().file.path,
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(write_guard) => {
-                        let write_guard = writer
-                            .file
-                            .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
-                            .await?;
-                        let read_guard = write_guard.mark_valid();
-                        return Ok(BlockLease::PageReadGuard(read_guard));
-                    }
-                }
-            }
-            zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
-                Ok(BlockLease::EphemeralFileMutableTail(buffer))
-            }
-        }
-    }
-}
-
-impl Drop for RW {
-    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
-
-        // unlink the file
-        // we are clear to do this, because we have entered a gate
-        let res = std::fs::remove_file(&self.rw.as_writer().file.path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.rw.as_writer().file.path,
-                    e
-                );
-            }
-        }
-    }
-}
-
-struct PreWarmingWriter {
-    prewarm_on_write: PrewarmOnWrite,
-    nwritten_blocks: u32,
-    page_cache_file_id: page_cache::FileId,
-    file: VirtualFile,
-}
-
-impl PreWarmingWriter {
-    fn new(
-        page_cache_file_id: page_cache::FileId,
-        file: VirtualFile,
-        prewarm_on_write: PrewarmOnWrite,
-    ) -> Self {
-        Self {
-            prewarm_on_write,
-            nwritten_blocks: 0,
-            page_cache_file_id,
-            file,
-        }
-    }
-
-    /// Return the byte range within `file` that has been written though `write_all`.
-    ///
-    /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`.
-    fn written_range(&self) -> (impl Deref<Target = Range<usize>> + '_) {
-        let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap();
-        struct Wrapper(Range<usize>);
-        impl Deref for Wrapper {
-            type Target = Range<usize>;
-            fn deref(&self) -> &Range<usize> {
-                &self.0
-            }
-        }
-        Wrapper(0..nwritten_blocks * PAGE_SZ)
-    }
-}
-
-impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter {
-    async fn write_all<Buf: tokio_epoll_uring::IoBuf + Send>(
-        &mut self,
-        buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> std::io::Result<(usize, FullSlice<Buf>)> {
-        let buflen = buf.len();
-        assert_eq!(
-            buflen % PAGE_SZ,
-            0,
-            "{buflen} ; we know TAIL_SZ is a PAGE_SZ multiple, and write_buffered_borrowed is used"
-        );
-
-        // Do the IO.
-        let buf = match self.file.write_all(buf, ctx).await {
-            (buf, Ok(nwritten)) => {
-                assert_eq!(nwritten, buflen);
-                buf
-            }
-            (_, Err(e)) => {
-                return Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    // order error before path because path is long and error is short
-                    format!(
-                        "ephemeral_file: write_blob: write-back tail self.nwritten_blocks={}, buflen={}, {:#}: {}",
-                        self.nwritten_blocks, buflen, e, self.file.path,
-                    ),
-                ));
-            }
-        };
-
-        let nblocks = buflen / PAGE_SZ;
-        let nblocks32 = u32::try_from(nblocks).unwrap();
-
-        if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) {
-            // Pre-warm page cache with the contents.
-            // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming
-            // benefits the code that writes InMemoryLayer=>L0 layers.
-
-            let cache = page_cache::get();
-            static CTX: Lazy<RequestContext> = Lazy::new(|| {
-                RequestContext::new(
-                    crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache,
-                    crate::context::DownloadBehavior::Error,
-                )
-            });
-            for blknum_in_buffer in 0..nblocks {
-                let blk_in_buffer =
-                    &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ];
-                let blknum = self
-                    .nwritten_blocks
-                    .checked_add(blknum_in_buffer as u32)
-                    .unwrap();
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, &CTX)
-                    .await
-                {
-                    Err(e) => {
-                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
-                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
-                    }
-                    Ok(v) => match v {
-                        page_cache::ReadBufResult::Found(_guard) => {
-                            // This function takes &mut self, so, it shouldn't be possible to reach this point.
-                            unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \
-                                      and this function takes &mut self, so, no concurrent read_blk is possible");
-                        }
-                        page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                            write_guard.copy_from_slice(blk_in_buffer);
-                            let _ = write_guard.mark_valid();
-                        }
-                    },
-                }
-            }
-        }
-
-        self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap();
-        Ok((buflen, buf))
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
@@ -1,145 +0,0 @@
-//! The heart of how [`super::EphemeralFile`] does its reads and writes.
-//!
-//! # Writes
-//!
-//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
-//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
-//!
-//! # Reads
-//!
-//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
-//!
-//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
-//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
-//! if the read is for the prefix that has already been flushed.
-//!
-//! # Current Usage
-//!
-//! The current user of this module is [`super::page_caching::RW`].
-
-mod zero_padded;
-
-use crate::{
-    context::RequestContext,
-    page_cache::PAGE_SZ,
-    virtual_file::owned_buffers_io::{
-        self,
-        write::{Buffer, OwnedAsyncWriter},
-    },
-};
-
-const TAIL_SZ: usize = 64 * 1024;
-
-/// See module-level comment.
-pub struct RW<W: OwnedAsyncWriter> {
-    buffered_writer: owned_buffers_io::write::BufferedWriter<
-        zero_padded::Buffer<TAIL_SZ>,
-        owned_buffers_io::util::size_tracking_writer::Writer<W>,
-    >,
-}
-
-pub enum ReadResult<'a, W> {
-    NeedsReadFromWriter { writer: &'a W },
-    ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
-}
-
-impl<W> RW<W>
-where
-    W: OwnedAsyncWriter,
-{
-    pub fn new(writer: W) -> Self {
-        let bytes_flushed_tracker =
-            owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
-        let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
-            bytes_flushed_tracker,
-            zero_padded::Buffer::default(),
-        );
-        Self { buffered_writer }
-    }
-
-    pub(crate) fn as_writer(&self) -> &W {
-        self.buffered_writer.as_inner().as_inner()
-    }
-
-    pub async fn write_all_borrowed(
-        &mut self,
-        buf: &[u8],
-        ctx: &RequestContext,
-    ) -> std::io::Result<usize> {
-        self.buffered_writer.write_buffered_borrowed(buf, ctx).await
-    }
-
-    pub fn bytes_written(&self) -> u64 {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        flushed_offset + u64::try_from(buffer.pending()).unwrap()
-    }
-
-    /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
-    pub fn get_tail_zero_padded(&self) -> &[u8] {
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffer_written_up_to = buffer.pending();
-        // pad to next page boundary
-        let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
-            buffer_written_up_to
-        } else {
-            buffer_written_up_to
-                .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
-                .unwrap()
-        };
-        &buffer.as_zero_padded_slice()[0..read_up_to]
-    }
-
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
-        let flushed_offset = self.buffered_writer.as_inner().bytes_written();
-        let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
-        let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
-        let read_offset = (blknum as u64) * (PAGE_SZ as u64);
-
-        // The trailing page ("block") might only be partially filled,
-        // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
-        // Moreover, it has to be zero-padded, because when we still had
-        // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
-        // DeltaLayer probably has the same issue, not sure why it needs no special treatment.
-        // => check here that the read doesn't go beyond this potentially trailing
-        // => the zero-padding is done in the `else` branch below
-        let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
-            buffered_offset / (PAGE_SZ as u64)
-        } else {
-            (buffered_offset / (PAGE_SZ as u64)) + 1
-        };
-        if (blknum as u64) >= blocks_written {
-            return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
-        }
-
-        // assertions for the `if-else` below
-        assert_eq!(
-            flushed_offset % (TAIL_SZ as u64), 0,
-            "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
-        );
-        assert_eq!(
-            flushed_offset % (PAGE_SZ as u64),
-            0,
-            "the logic below can't handle if the page is spread across the flushed part and the buffer"
-        );
-
-        if read_offset < flushed_offset {
-            assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
-            Ok(ReadResult::NeedsReadFromWriter {
-                writer: self.as_writer(),
-            })
-        } else {
-            let read_offset_in_buffer = read_offset
-                .checked_sub(flushed_offset)
-                .expect("would have taken `if` branch instead of this one");
-            let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
-            let zero_padded_slice = buffer.as_zero_padded_slice();
-            let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
-            Ok(ReadResult::ServedFromZeroPaddedMutableTail {
-                buffer: page
-                    .try_into()
-                    .expect("the slice above got it as page-size slice"),
-            })
-        }
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
+++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs
@@ -1,110 +0,0 @@
-//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
-//! unwritten range is guaranteed to be zero-initialized.
-//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
-//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
-
-use std::mem::MaybeUninit;
-
-use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
-
-/// See module-level comment.
-pub struct Buffer<const N: usize> {
-    allocation: Box<[u8; N]>,
-    written: usize,
-}
-
-impl<const N: usize> Default for Buffer<N> {
-    fn default() -> Self {
-        Self {
-            allocation: Box::new(
-                // SAFETY: zeroed memory is a valid [u8; N]
-                unsafe { MaybeUninit::zeroed().assume_init() },
-            ),
-            written: 0,
-        }
-    }
-}
-
-impl<const N: usize> Buffer<N> {
-    #[inline(always)]
-    fn invariants(&self) {
-        // don't check by default, unoptimized is too expensive even for debug mode
-        if false {
-            debug_assert!(self.written <= N, "{}", self.written);
-            debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
-        }
-    }
-
-    pub fn as_zero_padded_slice(&self) -> &[u8; N] {
-        &self.allocation
-    }
-}
-
-impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
-    type IoBuf = Self;
-
-    fn cap(&self) -> usize {
-        self.allocation.len()
-    }
-
-    fn extend_from_slice(&mut self, other: &[u8]) {
-        self.invariants();
-        let remaining = self.allocation.len() - self.written;
-        if other.len() > remaining {
-            panic!("calling extend_from_slice() with insufficient remaining capacity");
-        }
-        self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
-        self.written += other.len();
-        self.invariants();
-    }
-
-    fn pending(&self) -> usize {
-        self.written
-    }
-
-    fn flush(self) -> FullSlice<Self> {
-        self.invariants();
-        let written = self.written;
-        FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
-    }
-
-    fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
-        let Self {
-            mut allocation,
-            written,
-        } = iobuf;
-        allocation[0..written].fill(0);
-        let new = Self {
-            allocation,
-            written: 0,
-        };
-        new.invariants();
-        new
-    }
-}
-
-/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
-/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
-///
-/// Remember that bytes_init is generally _not_ a tracker of the amount
-/// of valid data in the io buffer; we use `Slice` for that.
-/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
-///
-/// SAFETY:
-///
-/// The [`Self::allocation`] is stable becauses boxes are stable.
-/// The memory is zero-initialized, so, bytes_init is always N.
-unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
-    fn stable_ptr(&self) -> *const u8 {
-        self.allocation.as_ptr()
-    }
-
-    fn bytes_init(&self) -> usize {
-        // Yes, N, not self.written; Read the full comment of this impl block!
-        N
-    }
-
-    fn bytes_total(&self) -> usize {
-        N
-    }
-}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -64,7 +64,7 @@ use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
 use tokio::sync::OnceCell;
-use tokio_epoll_uring::IoBufMut;
+use tokio_epoll_uring::IoBuf;
 use tracing::*;

 use utils::{
@@ -458,7 +458,7 @@ impl DeltaLayerWriterInner {
        ctx: &RequestContext,
    ) -> (FullSlice<Buf>, anyhow::Result<()>)
    where
-        Buf: IoBufMut + Send,
+        Buf: IoBuf + Send,
    {
        assert!(
            self.lsn_range.start <= lsn,
@@ -666,7 +666,7 @@ impl DeltaLayerWriter {
        ctx: &RequestContext,
    ) -> (FullSlice<Buf>, anyhow::Result<()>)
    where
-        Buf: IoBufMut + Send,
+        Buf: IoBuf + Send,
    {
        self.inner
            .as_mut()
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -6,23 +6,24 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value};
-use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
-use crate::{l0_flush, page_cache, walrecord};
+use crate::{l0_flush, page_cache};
 use anyhow::{anyhow, Result};
+use bytes::Bytes;
 use camino::Utf8PathBuf;
+use itertools::Itertools;
 use pageserver_api::key::CompactKey;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use std::collections::BTreeMap;
+use std::collections::{BTreeMap, HashMap};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
+use tokio_epoll_uring::BoundedBuf;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // avoid binding to Write (conflicts with std::io::Write)
@@ -80,7 +81,7 @@ pub struct InMemoryLayerInner {
    /// All versions of all pages in the layer are kept here. Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
+    index: BTreeMap<CompactKey, VecMap<Lsn, InMemoryLayerIndexValue>>,

    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
@@ -89,6 +90,11 @@ pub struct InMemoryLayerInner {

    resource_units: GlobalResourceUnits,
 }
+pub(crate) struct InMemoryLayerIndexValue {
+    pub(crate) pos: u32,
+    pub(crate) len: u32,
+    pub(crate) will_init: bool, // XXX this blows up the size, can we shrink down `len`?
+}

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -230,7 +236,7 @@ impl InMemoryLayer {
        }
    }

-    pub(crate) fn try_len(&self) -> Option<u64> {
+    pub(crate) fn try_len(&self) -> Option<u32> {
        self.inner.try_read().map(|i| i.file.len()).ok()
    }

@@ -249,9 +255,7 @@ impl InMemoryLayer {
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
-    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
-
+    pub async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let end_str = self.end_lsn_or_max();

        println!(
@@ -259,39 +263,6 @@ impl InMemoryLayer {
            self.timeline_id, self.start_lsn, end_str,
        );

-        if !verbose {
-            return Ok(());
-        }
-
-        let cursor = inner.file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, vec_map) in inner.index.iter() {
-            for (lsn, pos) in vec_map.as_slice() {
-                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                let val = Value::des(&buf);
-                match val {
-                    Ok(Value::Image(img)) => {
-                        write!(&mut desc, " img {} bytes", img.len())?;
-                    }
-                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
-                        write!(
-                            &mut desc,
-                            " rec {} bytes will_init: {} {}",
-                            buf.len(),
-                            rec.will_init(),
-                            wal_desc
-                        )?;
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
-                    }
-                }
-                println!("  key {} at {}: {}", key, lsn, desc);
-            }
-        }
-
        Ok(())
    }

@@ -311,7 +282,12 @@ impl InMemoryLayer {
            .build();

        let inner = self.inner.read().await;
-        let reader = inner.file.block_cursor();
+
+        struct ValueRead {
+            entry_lsn: Lsn,
+            read: vectored_dio_read::ValueRead<Vec<u8>>,
+        }
+        let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();

        for range in keyspace.ranges.iter() {
            for (key, vec_map) in inner
@@ -326,24 +302,53 @@ impl InMemoryLayer {

                let slice = vec_map.slice_range(lsn_range);

-                for (entry_lsn, pos) in slice.iter().rev() {
-                    // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
-                    let buf = reader.read_blob(*pos, &ctx).await;
-                    if let Err(e) = buf {
-                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                for (entry_lsn, index_value) in slice.iter().rev() {
+                    reads.entry(key).or_default().push(ValueRead {
+                        entry_lsn: *entry_lsn,
+                        read: vectored_dio_read::ValueRead::new(
+                            index_value.pos,
+                            Vec::with_capacity(index_value.len as usize),
+                        ),
+                    });
+                    if index_value.will_init {
                        break;
                    }
+                }
+            }
+        }

-                    let value = Value::des(&buf.unwrap());
-                    if let Err(e) = value {
+        // Execute the read.
+        vectored_dio_read::execute(
+            &inner.file,
+            reads
+                .iter()
+                .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
+            &ctx,
+        )
+        .await;
+
+        // Process results into the reconstruct state
+        'next_key: for (key, value_reads) in reads {
+            for ValueRead { entry_lsn, read } in value_reads {
+                match read.into_result() {
+                    Err(e) => {
                        reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
-                        break;
+                        continue 'next_key;
                    }
+                    Ok(value_buf) => {
+                        let value = Value::des(&value_buf);
+                        if let Err(e) = value {
+                            reconstruct_state
+                                .on_key_error(key, PageReconstructError::from(anyhow!(e)));
+                            continue 'next_key;
+                        }

-                    let key_situation =
-                        reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
-                    if key_situation == ValueReconstructSituation::Complete {
-                        break;
+                        let key_situation =
+                            reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
+                        if key_situation == ValueReconstructSituation::Complete {
+                            // TODO: metric to see if we fetched more values than necessary
+                            continue 'next_key;
+                        }
                    }
                }
            }
@@ -355,6 +360,17 @@ impl InMemoryLayer {
    }
 }

+impl vectored_dio_read::File for EphemeralFile {
+    async fn read_at_to_end<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
+        &'b self,
+        start: u32,
+        dst: tokio_epoll_uring::Slice<B>,
+        ctx: &'a RequestContext,
+    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
+        EphemeralFile::read_at_to_end(self, start, dst, ctx).await
+    }
+}
+
 fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
 }
@@ -380,7 +396,7 @@ impl InMemoryLayer {
    /// Get layer size.
    pub async fn size(&self) -> Result<u64> {
        let inner = self.inner.read().await;
-        Ok(inner.file.len())
+        Ok(inner.file.len() as u64)
    }

    /// Create a new, empty, in-memory layer
@@ -424,11 +440,13 @@ impl InMemoryLayer {
        key: CompactKey,
        lsn: Lsn,
        buf: &[u8],
+        will_init: bool,
        ctx: &RequestContext,
    ) -> Result<()> {
        let mut inner = self.inner.write().await;
        self.assert_writable();
-        self.put_value_locked(&mut inner, key, lsn, buf, ctx).await
+        self.put_value_locked(&mut inner, key, lsn, buf, will_init, ctx)
+            .await
    }

    async fn put_value_locked(
@@ -437,31 +455,31 @@ impl InMemoryLayer {
        key: CompactKey,
        lsn: Lsn,
        buf: &[u8],
+        will_init: bool,
        ctx: &RequestContext,
    ) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

-        let off = {
-            locked_inner
-                .file
-                .write_blob(
-                    buf,
-                    &RequestContextBuilder::extend(ctx)
-                        .page_content_kind(PageContentKind::InMemoryLayer)
-                        .build(),
-                )
-                .await?
-        };
+        let entry = locked_inner
+            .file
+            .write_blob(
+                buf,
+                will_init,
+                &RequestContextBuilder::extend(ctx)
+                    .page_content_kind(PageContentKind::InMemoryLayer)
+                    .build(),
+            )
+            .await?;

        let vec_map = locked_inner.index.entry(key).or_default();
-        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
+        let old = vec_map.append_or_update_last(lsn, entry).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
            warn!("Key {} at {} already exists", key, lsn);
        }

        let size = locked_inner.file.len();
-        locked_inner.resource_units.maybe_publish_size(size);
+        locked_inner.resource_units.maybe_publish_size(size as u64);

        Ok(())
    }
@@ -473,7 +491,7 @@ impl InMemoryLayer {
    pub(crate) async fn tick(&self) -> Option<u64> {
        let mut inner = self.inner.write().await;
        let size = inner.file.len();
-        inner.resource_units.publish_size(size)
+        inner.resource_units.publish_size(size as u64)
    }

    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -536,7 +554,6 @@ impl InMemoryLayer {

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
-            Inner::PageCached => None,
            Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await),
        };

@@ -568,76 +585,31 @@ impl InMemoryLayer {
        .await?;

        match l0_flush_global_state {
-            l0_flush::Inner::PageCached => {
-                let ctx = RequestContextBuilder::extend(ctx)
-                    .page_content_kind(PageContentKind::InMemoryLayer)
-                    .build();
-
-                let mut buf = Vec::new();
-
-                let cursor = inner.file.block_cursor();
-
-                for (key, vec_map) in inner.index.iter() {
-                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
-                            .put_value_bytes(
-                                Key::from_compact(*key),
-                                *lsn,
-                                buf.slice_len(),
-                                will_init,
-                                &ctx,
-                            )
-                            .await;
-                        res?;
-                        buf = tmp.into_raw_slice().into_inner();
-                    }
-                }
-            }
            l0_flush::Inner::Direct { .. } => {
                let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
-                assert_eq!(
-                    file_contents.len() % PAGE_SZ,
-                    0,
-                    "needed by BlockReaderRef::Slice"
-                );
-                assert_eq!(file_contents.len(), {
-                    let written = usize::try_from(inner.file.len()).unwrap();
-                    if written % PAGE_SZ == 0 {
-                        written
-                    } else {
-                        written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
-                    }
-                });

-                let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
-
-                let mut buf = Vec::new();
+                let file_contents = Bytes::from(file_contents);

                for (key, vec_map) in inner.index.iter() {
                    // Write all page versions
-                    for (lsn, pos) in vec_map.as_slice() {
-                        // TODO: once we have blob lengths in the in-memory index, we can
-                        // 1. get rid of the blob_io / BlockReaderRef::Slice business and
-                        // 2. load the file contents into a Bytes and
-                        // 3. the use `Bytes::slice` to get the `buf` that is our blob
-                        // 4. pass that `buf` into `put_value_bytes`
-                        // => https://github.com/neondatabase/neon/issues/8183
-                        cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
-                        let will_init = Value::des(&buf)?.will_init();
-                        let (tmp, res) = delta_layer_writer
+                    for (lsn, entry) in vec_map.as_slice() {
+                        let InMemoryLayerIndexValue {
+                            pos,
+                            len,
+                            will_init,
+                        } = entry;
+                        let buf =
+                            Bytes::slice(&file_contents, *pos as usize..(*pos + *len) as usize);
+                        let (_buf, res) = delta_layer_writer
                            .put_value_bytes(
                                Key::from_compact(*key),
                                *lsn,
                                buf.slice_len(),
-                                will_init,
+                                *will_init,
                                ctx,
                            )
                            .await;
                        res?;
-                        buf = tmp.into_raw_slice().into_inner();
                    }
                }
            }
@@ -659,3 +631,5 @@ impl InMemoryLayer {
        Ok(Some((desc, path)))
    }
 }
+
+mod vectored_dio_read;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -0,0 +1,244 @@
+use std::{
+    collections::BTreeMap,
+    sync::{Arc, Mutex},
+};
+
+use itertools::Itertools;
+use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
+
+use crate::context::RequestContext;
+
+pub trait File {
+    async fn read_at_to_end<'a, 'b, B: IoBufMut + Send>(
+        &'b self,
+        start: u32,
+        dst: Slice<B>,
+        ctx: &'a RequestContext,
+    ) -> std::io::Result<(Slice<B>, usize)>;
+}
+
+trait Sealed {}
+
+pub trait Buffer: Sealed + std::ops::Deref<Target = [u8]> {
+    fn cap(&self) -> usize;
+    fn len(&self) -> usize;
+    fn remaining(&self) -> usize {
+        self.cap().checked_sub(self.len()).unwrap()
+    }
+    /// Panics if the total length would exceed the initialized capacity.
+    fn extend_from_slice(&mut self, src: &[u8]);
+}
+
+pub struct ValueRead<B: Buffer> {
+    pos: u32,
+    state: MutexRefCell<Result<B, Arc<std::io::Error>>>,
+}
+
+struct MutexRefCell<T>(Mutex<T>);
+impl<T> MutexRefCell<T> {
+    fn new(value: T) -> Self {
+        Self(Mutex::new(value))
+    }
+    fn borrow(&self) -> impl std::ops::Deref<Target = T> + '_ {
+        self.0.lock().unwrap()
+    }
+    fn borrow_mut(&self) -> impl std::ops::DerefMut<Target = T> + '_ {
+        self.0.lock().unwrap()
+    }
+    fn into_inner(self) -> T {
+        self.0.into_inner().unwrap()
+    }
+}
+
+impl<B: Buffer> ValueRead<B> {
+    pub fn new(pos: u32, buf: B) -> Self {
+        Self {
+            pos,
+            state: MutexRefCell::new(Ok(buf)),
+        }
+    }
+    pub fn into_result(self) -> Result<B, Arc<std::io::Error>> {
+        self.state.into_inner()
+    }
+}
+
+impl Sealed for Vec<u8> {}
+impl Buffer for Vec<u8> {
+    fn cap(&self) -> usize {
+        self.capacity()
+    }
+
+    fn len(&self) -> usize {
+        self.len()
+    }
+
+    fn extend_from_slice(&mut self, src: &[u8]) {
+        if self.len() + src.len() > self.cap() {
+            panic!("Buffer capacity exceeded");
+        }
+        Vec::extend_from_slice(self, src);
+    }
+}
+
+pub async fn execute<'a, 'b, 'c, I, F, B>(file: &'c F, reads: I, ctx: &'b RequestContext)
+where
+    I: IntoIterator<Item = &'a ValueRead<B>> + Send,
+    F: File + Send,
+    B: Buffer + IoBufMut + Send,
+{
+    const DIO_CHUNK_SIZE: usize = 512;
+
+    // Plan which parts of which chunks need to be appended to which buffer
+    struct ChunkReadDestination<'a, B: Buffer> {
+        value_read: &'a ValueRead<B>,
+        offset_in_chunk: u32,
+        len: u32,
+    }
+    // use of BTreeMap's sorted iterator is critical to ensure buffer is filled in order
+    let mut chunk_reads: BTreeMap<u32, Vec<ChunkReadDestination<B>>> = BTreeMap::new();
+    for value_read in reads {
+        let ValueRead { pos, state } = value_read;
+        let len = state
+            .borrow()
+            .as_ref()
+            .expect("we haven't started reading, no chance it's in Err() state")
+            .len();
+        let mut remaining = usize::try_from(len).unwrap();
+        let mut chunk_no = *pos / (DIO_CHUNK_SIZE as u32);
+        let mut offset_in_chunk = usize::try_from(*pos % (DIO_CHUNK_SIZE as u32)).unwrap();
+        while remaining > 0 {
+            let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk);
+            chunk_reads
+                .entry(chunk_no)
+                .or_default()
+                .push(ChunkReadDestination {
+                    value_read,
+                    offset_in_chunk: offset_in_chunk as u32,
+                    len: remaining_in_chunk as u32,
+                });
+            offset_in_chunk = 0;
+            chunk_no += 1;
+            remaining -= remaining_in_chunk;
+        }
+    }
+
+    // Merge adjacent chunk reads (merging pass on the BTreeMap iterator)
+    const MAX_CHUNK_BATCH_SIZE: usize = {
+        let desired = 128 * 1024; // 128k
+        if desired % DIO_CHUNK_SIZE != 0 {
+            panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
+            // compile-time error
+        }
+        desired / DIO_CHUNK_SIZE
+    };
+    struct MergedRead<'a, B: Buffer> {
+        start_chunk_no: u32,
+        nchunks: u32,
+        dsts: Vec<MergedChunkReadDestination<'a, B>>,
+    }
+    struct MergedChunkReadDestination<'a, B: Buffer> {
+        value_read: &'a ValueRead<B>,
+        offset_in_merged_read: u32,
+        len: u32,
+    }
+    let mut merged_reads: Vec<MergedRead<B>> = Vec::new();
+    let mut chunk_reads = chunk_reads.into_iter().peekable();
+    loop {
+        let mut last_chunk_no = None;
+        let to_merge: Vec<(u32, Vec<ChunkReadDestination<B>>)> = chunk_reads
+            .peeking_take_while(|(chunk_no, _)| {
+                if let Some(last_chunk_no) = last_chunk_no {
+                    if *chunk_no != last_chunk_no + 1 {
+                        return false;
+                    }
+                }
+                last_chunk_no = Some(*chunk_no);
+                true
+            })
+            .take(MAX_CHUNK_BATCH_SIZE)
+            .collect(); // TODO: avoid this .collect()
+        let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else {
+            break;
+        };
+        let nchunks = to_merge.len() as u32;
+        let dsts = to_merge
+            .into_iter()
+            .enumerate()
+            .flat_map(|(i, (_, dsts))| {
+                dsts.into_iter().map(
+                    move |ChunkReadDestination {
+                              value_read,
+                              offset_in_chunk,
+                              len,
+                          }| {
+                        MergedChunkReadDestination {
+                            value_read,
+                            offset_in_merged_read: i as u32 * DIO_CHUNK_SIZE as u32
+                                + offset_in_chunk,
+                            len,
+                        }
+                    },
+                )
+            })
+            .collect();
+        merged_reads.push(MergedRead {
+            start_chunk_no,
+            nchunks,
+            dsts,
+        });
+    }
+    drop(chunk_reads);
+
+    // Execute reads and fill the destination
+    // TODO: prefetch
+    let get_chunk_buf = |nchunks| Vec::with_capacity(nchunks as usize * (DIO_CHUNK_SIZE));
+    for MergedRead {
+        start_chunk_no,
+        nchunks,
+        dsts,
+    } in merged_reads
+    {
+        let all_done = dsts
+            .iter()
+            .all(|MergedChunkReadDestination { value_read, .. }| {
+                value_read.state.borrow().is_err()
+            });
+        if all_done {
+            continue;
+        }
+        let (merged_read_buf_slice, nread) = match file
+            .read_at_to_end(
+                start_chunk_no * DIO_CHUNK_SIZE as u32,
+                get_chunk_buf(nchunks).slice_full(),
+                ctx,
+            )
+            .await
+        {
+            Ok(t) => t,
+            Err(e) => {
+                let e = Arc::new(e);
+                for MergedChunkReadDestination { value_read, .. } in dsts {
+                    *value_read.state.borrow_mut() = Err(Arc::clone(&e));
+                    // this will make later reads for the given ValueRead short-circuit, see top of loop body
+                }
+                continue;
+            }
+        };
+        let merged_read_buf = merged_read_buf_slice.into_inner();
+        assert_eq!(nread, merged_read_buf.len());
+        let merged_read_buf = &merged_read_buf[..nread];
+        for MergedChunkReadDestination {
+            value_read,
+            offset_in_merged_read,
+            len,
+        } in dsts
+        {
+            if let Ok(buf) = &mut *value_read.state.borrow_mut() {
+                let data = &merged_read_buf
+                    [offset_in_merged_read as usize..(offset_in_merged_read + len) as usize];
+                assert!(buf.remaining() >= data.len());
+                buf.extend_from_slice(data);
+            }
+        }
+    }
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1463,6 +1463,7 @@ impl Timeline {
            tracing::warn!("Lock conflict while reading size of open layer");
            return;
        };
+        let current_size = current_size as u64;

        let current_lsn = self.get_last_record_lsn();

@@ -5576,7 +5577,9 @@ impl<'a> TimelineWriter<'a> {

        let action = self.get_open_layer_action(lsn, buf_size);
        let layer = self.handle_open_layer_action(lsn, action, ctx).await?;
-        let res = layer.put_value(key.to_compact(), lsn, &buf, ctx).await;
+        let res = layer
+            .put_value(key.to_compact(), lsn, &buf, value.will_init(), ctx)
+            .await;

        if res.is_ok() {
            // Update the current size only when the entire write was ok.
Author	SHA1	Message	Date
Christian Schwarz	6efd9fb0a6	WIP: refactor for unit testing	2024-08-19 16:56:24 +00:00
Christian Schwarz	0d22d4d310	https://github.com/neondatabase/neon/pull/8537#discussion_r1719877858	2024-08-19 15:26:56 +00:00
Christian Schwarz	10c7419fee	https://github.com/neondatabase/neon/pull/8537#discussion_r1719830114	2024-08-19 15:22:33 +00:00
Christian Schwarz	311282762f	manually benchmark get_values_reconstruct_data performance on my M2 MacBook Pro, running a Linux VM ./target/release/neon_local stop rm -rf .neon ./target/release/neon_local init ./target/release/neon_local start ./target/release/neon_local tenant create --set-default ./target/release/neon_local endpoint create foo ./target/release/neon_local endpoint start foo psql 'postgresql://cloud_admin@127.0.0.1:55432/postgres' psql (13.16 (Debian 13.16-0+deb11u1), server 15.7) CREATE TABLE wal_test ( id SERIAL PRIMARY KEY, data TEXT ); DO $$ DECLARE i INTEGER := 1; BEGIN WHILE i <= 500000 LOOP INSERT INTO wal_test (data) VALUES ('data'); i := i + 1; END LOOP; END $$; -- => result is one L0 from initdb and one 137M-sized ephemeral-2 DO $$ DECLARE i INTEGER := 1; random_id INTEGER; random_record wal_test%ROWTYPE; start_time TIMESTAMP := clock_timestamp(); selects_completed INTEGER := 0; min_id INTEGER := 1; -- Minimum ID value max_id INTEGER := 100000; -- Maximum ID value, based on your insert range iters INTEGER := 100000000; -- Number of iterations to run BEGIN WHILE i <= iters LOOP -- Generate a random ID within the known range random_id := min_id + floor(random() * (max_id - min_id + 1))::int; -- Select the row with the generated random ID SELECT * INTO random_record FROM wal_test WHERE id = random_id; -- Increment the select counter selects_completed := selects_completed + 1; -- Check if a second has passed IF EXTRACT(EPOCH FROM clock_timestamp() - start_time) >= 1 THEN -- Print the number of selects completed in the last second RAISE NOTICE 'Selects completed in last second: %', selects_completed; -- Reset counters for the next second selects_completed := 0; start_time := clock_timestamp(); END IF; -- Increment the loop counter i := i + 1; END LOOP; END $$; ./target/release/neon_local stop baseline: commit `d9a57aeed9` origin/main, NOTICE: Selects completed in last second: 1286 NOTICE: Selects completed in last second: 1352 NOTICE: Selects completed in last second: 1365 NOTICE: Selects completed in last second: 1399 NOTICE: Selects completed in last second: 1410 NOTICE: Selects completed in last second: 1393 NOTICE: Selects completed in last second: 1316 ours NOTICE: Selects completed in last second: 1541 NOTICE: Selects completed in last second: 1536 NOTICE: Selects completed in last second: 1493 NOTICE: Selects completed in last second: 1379 NOTICE: Selects completed in last second: 1519 NOTICE: Selects completed in last second: 1546 NOTICE: Selects completed in last second: 1489 NOTICE: Selects completed in last second: 1578 NOTICE: Selects completed in last second: 1508	2024-08-15 12:51:04 +00:00
Christian Schwarz	72966e06a2	bench_ingest results (on my MBP linux VM) cargo bench --bench bench_ingest -- 'ingest 128MB/100b seq, no delta' baseline: commit `d9a57aeed9` (origin/main) ingest-small-values/ingest 128MB/100b seq, no delta time: [527.44 ms 543.79 ms 562.97 ms] thrpt: [227.36 MiB/s 235.39 MiB/s 242.68 MiB/s] HEAD~1 ingest-small-values/ingest 128MB/100b seq, no delta time: [491.37 ms 494.69 ms 498.30 ms] thrpt: [256.87 MiB/s 258.75 MiB/s 260.49 MiB/s]	2024-08-15 11:46:24 +00:00
Christian Schwarz	fb78185074	merging of adjacent chunk reads, up to max batch size	2024-08-15 11:31:24 +00:00
Christian Schwarz	6f65b4d2d3	don't think in pages, but DIO chunks; remove read_page & page_caching remnants	2024-08-15 11:28:34 +00:00
Christian Schwarz	37bfa04996	implement coalescing of multiple reads onto same page	2024-08-14 19:58:26 +00:00
Christian Schwarz	332ca2bf09	WIP	2024-08-14 19:58:26 +00:00