pageserver: optimise disk io for vectored get (#6780)

## Problem

The vectored read path proposed in https://github.com/neondatabase/neon/pull/6576 seems to be functionally correct, but in my testing (see below) it is about 10-20% slower than the naive sequential vectored implementation.

## Summary of changes

There are three parts to this PR:

1. Supporting vectored blob reads. This is actually trickier than it sounds because on-disk blobs are prefixed with a variable-length size header. Since the blobs are not necessarily fixed-size, we need to juggle the offsets such that the callers can retrieve the blobs from the resulting buffer.
2. Merging disk read requests issued by the vectored read path up to a maximum size. Again, the merging is complicated by the fact that blobs are not fixed-size. We keep track of the begin and end offset of each blob and pass them into the vectored blob reader. In turn, the reader will return a buffer and the offsets at which the blobs begin and end.
3. Adding a benchmark for basebackup requests against tenants with large SLRU block counts. This required a small change to pagebench and a new config variable for the pageserver which toggles the vectored get validation.

We can probably optimise things further by adding a little bit of concurrency for our IO. In principle, it's as simple as spawning a task which issues the IO, while the serialisation and handling are done on the parent task, which receives input via a channel.
2025-12-22 21:59:59 +00:00 · 2024-02-28 12:06:00 +00:00
parent b6bd75964f
commit 2b11466b59
20 changed files with 1201 additions and 308 deletions
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -12,7 +12,7 @@ use std::collections::BinaryHeap;
 use std::ops::Range;
 use std::{fs, str};

-use pageserver::page_cache::PAGE_SZ;
+use pageserver::page_cache::{self, PAGE_SZ};
 use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
@@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        file,
+        block_reader,
    );
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
    page_cache::init(100);
-    let file = FileBlockReader::new(VirtualFile::open(path).await?);
-    let summary_blk = file.read_blk(0, ctx).await?;
+    let file = VirtualFile::open(path).await?;
+    let file_id = page_cache::next_file_id();
+    let block_reader = FileBlockReader::new(&file, file_id);
+    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
-        &file,
+        &block_reader,
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
@@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
            ctx,
        )
        .await?;
-    let cursor = BlockCursor::new_fileblockreader(&file);
+    let cursor = BlockCursor::new_fileblockreader(&block_reader);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -8,7 +8,7 @@ use utils::lsn::Lsn;
 use rand::prelude::*;
 use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{debug, info, instrument};
+use tracing::{info, instrument};

 use std::collections::HashMap;
 use std::num::NonZeroUsize;
@@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
-    #[clap(long, default_value = "localhost:64000")]
-    page_service_host_port: String,
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    page_service_connstring: String,
    #[clap(long)]
    pageserver_jwt: Option<String>,
    #[clap(long, default_value = "1")]
@@ -230,12 +230,9 @@ async fn client(
 ) {
    start_work_barrier.wait().await;

-    let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring(
-        &args.page_service_host_port,
-        args.pageserver_jwt.as_deref(),
-    ))
-    .await
-    .unwrap();
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();

    while let Some(Work { lsn, gzip }) = work.recv().await {
        let start = Instant::now();
@@ -263,7 +260,7 @@ async fn client(
                }
            })
            .await;
-        debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
+        info!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
        let elapsed = start.elapsed();
        live_stats.inc();
        STATS.with(|stats| {
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -3,7 +3,6 @@ use utils::logging;

 /// Re-usable pieces of code that aren't CLI-specific.
 mod util {
-    pub(crate) mod connstring;
    pub(crate) mod request_stats;
    #[macro_use]
    pub(crate) mod tokio_thread_local_stats;
--- a/pageserver/pagebench/src/util/connstring.rs
+++ b/pageserver/pagebench/src/util/connstring.rs
@@ -1,8 +0,0 @@
-pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
-    let colon_and_jwt = if let Some(jwt) = jwt {
-        format!(":{jwt}") // TODO: urlescape
-    } else {
-        String::new()
-    };
-    format!("postgres://postgres{colon_and_jwt}@{host_port}")
-}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -143,6 +143,7 @@ where
    ar: &'a mut Builder<&'b mut W>,
    buf: Vec<u8>,
    current_segment: Option<(SlruKind, u32)>,
+    total_blocks: usize,
 }

 impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
@@ -154,6 +155,7 @@ where
            ar,
            buf: Vec::new(),
            current_segment: None,
+            total_blocks: 0,
        }
    }

@@ -199,7 +201,8 @@ where
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar.append(&header, self.buf.as_slice()).await?;

-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+        self.total_blocks += nblocks;
+        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);

        self.buf.clear();

@@ -207,11 +210,15 @@ where
    }

    async fn finish(mut self) -> anyhow::Result<()> {
-        if self.current_segment.is_none() || self.buf.is_empty() {
-            return Ok(());
-        }
+        let res = if self.current_segment.is_none() || self.buf.is_empty() {
+            Ok(())
+        } else {
+            self.flush().await
+        };

-        self.flush().await
+        info!("Collected {} SLRU blocks", self.total_blocks);
+
+        res
    }
 }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -34,6 +34,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::timeline::GetVectoredImpl;
+use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
 use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
@@ -87,6 +88,10 @@ pub mod defaults {

    pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";

+    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
+
+    pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true;
+
    ///
    /// Default built-in configuration file.
    ///
@@ -126,6 +131,10 @@ pub mod defaults {

 #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}'

+#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}'
+
+#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -262,6 +271,10 @@ pub struct PageServerConf {
    pub virtual_file_io_engine: virtual_file::IoEngineKind,

    pub get_vectored_impl: GetVectoredImpl,
+
+    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+
+    pub validate_vectored_get: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -350,6 +363,10 @@ struct PageServerConfigBuilder {
    virtual_file_io_engine: BuilderValue<virtual_file::IoEngineKind>,

    get_vectored_impl: BuilderValue<GetVectoredImpl>,
+
+    max_vectored_read_bytes: BuilderValue<MaxVectoredReadBytes>,
+
+    validate_vectored_get: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -429,6 +446,10 @@ impl Default for PageServerConfigBuilder {
            virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()),

            get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()),
+            max_vectored_read_bytes: Set(MaxVectoredReadBytes(
+                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
+            )),
+            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
        }
    }
 }
@@ -593,6 +614,14 @@ impl PageServerConfigBuilder {
        self.get_vectored_impl = BuilderValue::Set(value);
    }

+    pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) {
+        self.max_vectored_read_bytes = BuilderValue::Set(value);
+    }
+
+    pub fn get_validate_vectored_get(&mut self, value: bool) {
+        self.validate_vectored_get = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -706,6 +735,12 @@ impl PageServerConfigBuilder {
            get_vectored_impl: self
                .get_vectored_impl
                .ok_or(anyhow!("missing get_vectored_impl"))?,
+            max_vectored_read_bytes: self
+                .max_vectored_read_bytes
+                .ok_or(anyhow!("missing max_vectored_read_bytes"))?,
+            validate_vectored_get: self
+                .validate_vectored_get
+                .ok_or(anyhow!("missing validate_vectored_get"))?,
        })
    }
 }
@@ -952,6 +987,15 @@ impl PageServerConf {
                "get_vectored_impl" => {
                    builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?)
                }
+                "max_vectored_read_bytes" => {
+                    let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize;
+                    builder.get_max_vectored_read_bytes(
+                        MaxVectoredReadBytes(
+                            NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0")))
+                }
+                "validate_vectored_get" => {
+                    builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1027,6 +1071,11 @@ impl PageServerConf {
            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
            get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+            max_vectored_read_bytes: MaxVectoredReadBytes(
+                NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                    .expect("Invalid default constant"),
+            ),
+            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
        }
    }
 }
@@ -1261,6 +1310,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1326,6 +1380,11 @@ background_task_maximum_delay = '334 s'
                ingest_batch_size: 100,
                virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(),
                get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(),
+                max_vectored_read_bytes: MaxVectoredReadBytes(
+                    NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES)
+                        .expect("Invalid default constant")
+                ),
+                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -145,6 +145,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+pub mod vectored_blob_io;

 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -5,7 +5,7 @@
 use super::ephemeral_file::EphemeralFile;
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
-use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
 use crate::virtual_file::VirtualFile;
 use bytes::Bytes;
 use std::ops::Deref;
@@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> {
 ///
 /// Unlike traits, we also support the read function to be async though.
 pub(crate) enum BlockReaderRef<'a> {
-    FileBlockReader(&'a FileBlockReader),
+    FileBlockReader(&'a FileBlockReader<'a>),
    EphemeralFile(&'a EphemeralFile),
    Adapter(Adapter<&'a DeltaLayerInner>),
    #[cfg(test)]
@@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> {
 ///
 /// The file is assumed to be immutable. This doesn't provide any functions
 /// for modifying the file, nor for invalidating the cache if it is modified.
-pub struct FileBlockReader {
-    pub file: VirtualFile,
+pub struct FileBlockReader<'a> {
+    pub file: &'a VirtualFile,

    /// Unique ID of this file, used as key in the page cache.
    file_id: page_cache::FileId,
 }

-impl FileBlockReader {
-    pub fn new(file: VirtualFile) -> Self {
-        let file_id = page_cache::next_file_id();
-
+impl<'a> FileBlockReader<'a> {
+    pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self {
        FileBlockReader { file_id, file }
    }

@@ -190,11 +188,11 @@ impl FileBlockReader {
    /// Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
-    pub async fn read_blk(
+    pub async fn read_blk<'b>(
        &self,
        blknum: u32,
        ctx: &RequestContext,
-    ) -> Result<BlockLease, std::io::Error> {
+    ) -> Result<BlockLease<'b>, std::io::Error> {
        let cache = page_cache::get();
        match cache
            .read_immutable_buf(self.file_id, blknum, ctx)
@@ -215,7 +213,7 @@ impl FileBlockReader {
    }
 }

-impl BlockReader for FileBlockReader {
+impl BlockReader for FileBlockReader<'_> {
    fn block_cursor(&self) -> BlockCursor<'_> {
        BlockCursor::new(BlockReaderRef::FileBlockReader(self))
    }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -209,8 +209,7 @@ impl Default for ValuesReconstructState {
 pub(crate) enum ReadableLayerDesc {
    Persistent {
        desc: PersistentLayerDesc,
-        lsn_floor: Lsn,
-        lsn_ceil: Lsn,
+        lsn_range: Range<Lsn>,
    },
    InMemory {
        handle: InMemoryLayerHandle,
@@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {}
 impl ReadableLayerDesc {
    pub(crate) fn get_lsn_floor(&self) -> Lsn {
        match self {
-            ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start,
            ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(),
        }
    }

    pub(crate) fn get_lsn_ceil(&self) -> Lsn {
        match self {
-            ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil,
+            ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end,
            ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil,
        }
    }
@@ -329,10 +328,15 @@ impl ReadableLayerDesc {
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
        match self {
-            ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => {
+            ReadableLayerDesc::Persistent { desc, lsn_range } => {
                let layer = layer_manager.get_from_desc(desc);
                layer
-                    .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(
+                        keyspace,
+                        lsn_range.clone(),
+                        reconstruct_state,
+                        ctx,
+                    )
                    .await
            }
            ReadableLayerDesc::InMemory { handle, lsn_ceil } => {
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,25 +29,28 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
+use bytes::BytesMut;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::LayerAccessKind;
 use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
-use std::collections::BTreeMap;
 use std::fs::File;
 use std::io::SeekFrom;
 use std::ops::Range;
@@ -63,8 +66,7 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation,
-    ValuesReconstructState,
+    AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState,
 };

 ///
@@ -214,8 +216,10 @@ pub struct DeltaLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

 impl std::fmt::Debug for DeltaLayerInner {
@@ -297,7 +301,7 @@ impl DeltaLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let loaded = DeltaLayerInner::load(&path, None, ctx)
+        let loaded = DeltaLayerInner::load(&path, None, None, ctx)
            .await
            .and_then(|res| res)?;

@@ -665,16 +669,16 @@ impl DeltaLayer {
    where
        F: Fn(Summary) -> Summary,
    {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
            path,
            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
        if actual_summary.magic != DELTA_FILE_MAGIC {
            return Err(RewriteSummaryError::MagicMismatch);
        }
@@ -698,15 +702,18 @@ impl DeltaLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
        summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
        let file = match VirtualFile::open(path).await {
            Ok(file) => file,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
        };
-        let file = FileBlockReader::new(file);
+        let file_id = page_cache::next_file_id();

-        let summary_blk = match file.read_blk(0, ctx).await {
+        let block_reader = FileBlockReader::new(&file, file_id);
+
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
            Ok(blk) => blk,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
        };
@@ -730,8 +737,10 @@ impl DeltaLayerInner {

        Ok(Ok(DeltaLayerInner {
            file,
+            file_id,
            index_start_blk: actual_summary.index_start_blk,
            index_root_blk: actual_summary.index_root_blk,
+            max_vectored_read_bytes,
        }))
    }

@@ -744,11 +753,11 @@ impl DeltaLayerInner {
    ) -> anyhow::Result<ValueReconstructResult> {
        let mut need_image = true;
        // Scan the page versions backwards, starting from `lsn`.
-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            &block_reader,
        );
        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

@@ -782,19 +791,19 @@ impl DeltaLayerInner {
            .build();

        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
+        let cursor = block_reader.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
            cursor
                .read_blob_into_buf(pos, &mut buf, ctx)
                .await
                .with_context(|| {
-                    format!("Failed to read blob from virtual file {}", file.file.path)
+                    format!("Failed to read blob from virtual file {}", self.file.path)
                })?;
            let val = Value::des(&buf).with_context(|| {
                format!(
                    "Failed to deserialize file blob from virtual file {}",
-                    file.file.path
+                    self.file.path
                )
            })?;
            match val {
@@ -834,133 +843,181 @@ impl DeltaLayerInner {
    pub(super) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
+        let reads = self
+            .plan_reads(keyspace, lsn_range, reconstruct_state, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;
+
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

-        let mut offsets: BTreeMap<Key, Vec<(Lsn, u64)>> = BTreeMap::new();
-
        for range in keyspace.ranges.iter() {
-            let mut ignore_key = None;
+            let mut range_end_handled = false;

-            // Scan the page versions backwards, starting from the last key in the range.
-            // to collect all the offsets at which need to be read.
-            let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1));
+            let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start);
            tree_reader
                .visit(
-                    &end_key.0,
-                    VisitDirection::Backwards,
+                    &start_key.0,
+                    VisitDirection::Forwards,
                    |raw_key, value| {
                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
-                        let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key);
-
-                        if entry_lsn >= end_lsn {
-                            return true;
-                        }
-
-                        if key < range.start {
-                            return false;
-                        }
-
-                        if key >= range.end {
-                            return true;
-                        }
-
-                        if Some(key) == ignore_key {
-                            return true;
-                        }
-
-                        if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) {
-                            if entry_lsn <= cached_lsn {
-                                return key != range.start;
-                            }
-                        }
-
+                        let lsn = DeltaKey::extract_lsn_from_buf(raw_key);
                        let blob_ref = BlobRef(value);
-                        let lsns_at = offsets.entry(key).or_default();
-                        lsns_at.push((entry_lsn, blob_ref.pos()));

-                        if blob_ref.will_init() {
-                            if key == range.start {
-                                return false;
+                        assert!(key >= range.start && lsn >= lsn_range.start);
+
+                        let cached_lsn = reconstruct_state.get_cached_lsn(&key);
+                        let flag = {
+                            if cached_lsn >= Some(lsn) {
+                                BlobFlag::Ignore
+                            } else if blob_ref.will_init() {
+                                BlobFlag::Replaces
                            } else {
-                                ignore_key = Some(key);
-                                return true;
+                                BlobFlag::None
                            }
-                        }
+                        };

-                        true
+                        if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) {
+                            planner.handle_range_end(blob_ref.pos());
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, lsn, blob_ref.pos(), flag);
+                            true
+                        }
                    },
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
                        .build(),
                )
                .await
-                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }
+                .map_err(|err| anyhow!(err))?;

-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::DeltaLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, lsns_at) in offsets {
-            for (lsn, block_offset) in lsns_at {
-                let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await;
-
-                if let Err(e) = res {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to read blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let value = Value::des(&buf);
-                if let Err(e) = value {
-                    reconstruct_state.on_key_error(
-                        key,
-                        PageReconstructError::from(anyhow!(e).context(format!(
-                            "Failed to deserialize file blob from virtual file {}",
-                            file.file.path
-                        ))),
-                    );
-
-                    break;
-                }
-
-                let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap());
-                if key_situation == ValueReconstructSituation::Complete {
-                    break;
-                }
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                tracing::info!("Handling range end fallback at {}", payload_end);
+                planner.handle_range_end(payload_end);
            }
        }

-        Ok(())
+        Ok(planner.finish())
+    }
+
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        let mut ignore_key_with_err = None;
+
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+        let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+        // Note that reads are processed in reverse order (from highest key+lsn).
+        // This is the order that `ReconstructState` requires such that it can
+        // track when a key is done.
+        for read in reads.into_iter().rev() {
+            let res = vectored_blob_reader
+                .read_blobs(&read, buf.take().expect("Should have a buffer"))
+                .await;
+
+            let blobs_buf = match res {
+                Ok(blobs_buf) => blobs_buf,
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+
+                    // We have "lost" the buffer since the lower level IO api
+                    // doesn't return the buffer on error. Allocate a new one.
+                    buf = Some(BytesMut::with_capacity(max_vectored_read_bytes));
+
+                    continue;
+                }
+            };
+
+            for meta in blobs_buf.blobs.iter().rev() {
+                if Some(meta.meta.key) == ignore_key_with_err {
+                    continue;
+                }
+
+                let value = Value::des(&blobs_buf.buf[meta.start..meta.end]);
+                let value = match value {
+                    Ok(v) => v,
+                    Err(e) => {
+                        reconstruct_state.on_key_error(
+                            meta.meta.key,
+                            PageReconstructError::from(anyhow!(e).context(format!(
+                                "Failed to deserialize blob from virtual file {}",
+                                self.file.path,
+                            ))),
+                        );
+
+                        ignore_key_with_err = Some(meta.meta.key);
+                        continue;
+                    }
+                };
+
+                // Invariant: once a key reaches [`ValueReconstructSituation::Complete`]
+                // state, no further updates shall be made to it. The call below will
+                // panic if the invariant is violated.
+                reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value);
+            }
+
+            buf = Some(blobs_buf.buf);
+        }
    }

    pub(super) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
    ) -> Result<Vec<DeltaEntry<'a>>> {
-        let file = &self.file;
-
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

        let mut all_keys: Vec<DeltaEntry<'_>> = Vec::new();
@@ -1012,11 +1069,11 @@ impl DeltaLayerInner {
            self.index_start_blk, self.index_root_blk
        );

-        let file = &self.file;
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
            self.index_root_blk,
-            file,
+            block_reader,
        );

        tree_reader.dump().await?;
@@ -1111,7 +1168,8 @@ impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
        blknum: u32,
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum, ctx).await
+        let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id);
+        block_reader.read_blk(blknum, ctx).await
    }
 }

--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -25,7 +25,7 @@
 //! actual page images are stored in the "values" part.
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, FileId, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
@@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{
    LayerAccessStats, ValueReconstructResult, ValueReconstructState,
 };
 use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::vectored_blob_io::{
+    BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
+};
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::virtual_file::{self, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{anyhow, bail, ensure, Context, Result};
-use bytes::Bytes;
+use bytes::{Bytes, BytesMut};
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use pageserver_api::keyspace::KeySpace;
@@ -152,8 +155,10 @@ pub struct ImageLayerInner {

    lsn: Lsn,

-    /// Reader object for reading blocks from the file.
-    file: FileBlockReader,
+    file: VirtualFile,
+    file_id: FileId,
+
+    max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
 }

 impl std::fmt::Debug for ImageLayerInner {
@@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner {

 impl ImageLayerInner {
    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
-        let file = &self.file;
-        let tree_reader =
-            DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );

        tree_reader.dump().await?;

@@ -252,7 +260,7 @@ impl ImageLayer {
    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
        let path = self.path();

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx)
            .await
            .and_then(|res| res)?;

@@ -327,16 +335,16 @@ impl ImageLayer {
    where
        F: Fn(Summary) -> Summary,
    {
-        let file = VirtualFile::open_with_options(
+        let mut file = VirtualFile::open_with_options(
            path,
            virtual_file::OpenOptions::new().read(true).write(true),
        )
        .await
        .with_context(|| format!("Failed to open file '{}'", path))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0, ctx).await?;
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = block_reader.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
-        let mut file = file.file;
        if actual_summary.magic != IMAGE_FILE_MAGIC {
            return Err(RewriteSummaryError::MagicMismatch);
        }
@@ -361,14 +369,16 @@ impl ImageLayerInner {
        path: &Utf8Path,
        lsn: Lsn,
        summary: Option<Summary>,
+        max_vectored_read_bytes: Option<MaxVectoredReadBytes>,
        ctx: &RequestContext,
    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
        let file = match VirtualFile::open(path).await {
            Ok(file) => file,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
        };
-        let file = FileBlockReader::new(file);
-        let summary_blk = match file.read_blk(0, ctx).await {
+        let file_id = page_cache::next_file_id();
+        let block_reader = FileBlockReader::new(&file, file_id);
+        let summary_blk = match block_reader.read_blk(0, ctx).await {
            Ok(blk) => blk,
            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
        };
@@ -399,6 +409,8 @@ impl ImageLayerInner {
            index_root_blk: actual_summary.index_root_blk,
            lsn,
            file,
+            file_id,
+            max_vectored_read_bytes,
        }))
    }

@@ -408,8 +420,9 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader);

        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
        key.write_to_byte_slice(&mut keybuf);
@@ -422,7 +435,7 @@ impl ImageLayerInner {
            )
            .await?
        {
-            let blob = file
+            let blob = block_reader
                .block_cursor()
                .read_blob(
                    offset,
@@ -449,12 +462,36 @@ impl ImageLayerInner {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+        let reads = self
+            .plan_reads(keyspace, ctx)
+            .await
+            .map_err(GetVectoredError::Other)?;

-        let mut offsets = Vec::new();
+        self.do_reads_and_update_state(reads, reconstruct_state)
+            .await;
+
+        Ok(())
+    }
+
+    async fn plan_reads(
+        &self,
+        keyspace: KeySpace,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Vec<VectoredRead>> {
+        let mut planner = VectoredReadPlanner::new(
+            self.max_vectored_read_bytes
+                .expect("Layer is loaded with max vectored bytes config")
+                .0
+                .into(),
+        );
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader =
+            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);

        for range in keyspace.ranges.iter() {
+            let mut range_end_handled = false;
+
            let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
            range.start.write_to_byte_slice(&mut search_key);

@@ -462,17 +499,18 @@ impl ImageLayerInner {
                .visit(
                    &search_key,
                    VisitDirection::Forwards,
-                    |raw_key, value| {
+                    |raw_key, offset| {
                        let key = Key::from_slice(&raw_key[..KEY_SIZE]);
                        assert!(key >= range.start);

-                        if !range.contains(&key) {
-                            return false;
+                        if key >= range.end {
+                            planner.handle_range_end(offset);
+                            range_end_handled = true;
+                            false
+                        } else {
+                            planner.handle(key, self.lsn, offset, BlobFlag::None);
+                            true
                        }
-
-                        offsets.push((key, value));
-
-                        true
                    },
                    &RequestContextBuilder::extend(ctx)
                        .page_content_kind(PageContentKind::ImageLayerBtreeNode)
@@ -480,33 +518,60 @@ impl ImageLayerInner {
                )
                .await
                .map_err(|err| GetVectoredError::Other(anyhow!(err)))?;
-        }

-        let ctx = &RequestContextBuilder::extend(ctx)
-            .page_content_kind(PageContentKind::ImageLayerValue)
-            .build();
-
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (key, offset) in offsets {
-            let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await;
-            if let Err(e) = res {
-                reconstruct_state.on_key_error(
-                    key,
-                    PageReconstructError::from(anyhow!(e).context(format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path
-                    ))),
-                );
-
-                continue;
+            if !range_end_handled {
+                let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64;
+                planner.handle_range_end(payload_end);
            }
-
-            let blob = Bytes::copy_from_slice(buf.as_slice());
-            reconstruct_state.update_key(&key, self.lsn, Value::Image(blob));
        }

-        Ok(())
+        Ok(planner.finish())
+    }
+
+    async fn do_reads_and_update_state(
+        &self,
+        reads: Vec<VectoredRead>,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) {
+        let max_vectored_read_bytes = self
+            .max_vectored_read_bytes
+            .expect("Layer is loaded with max vectored bytes config")
+            .0
+            .into();
+
+        let vectored_blob_reader = VectoredBlobReader::new(&self.file);
+        for read in reads.into_iter() {
+            let buf = BytesMut::with_capacity(max_vectored_read_bytes);
+            let res = vectored_blob_reader.read_blobs(&read, buf).await;
+
+            match res {
+                Ok(blobs_buf) => {
+                    let frozen_buf = blobs_buf.buf.freeze();
+
+                    for meta in blobs_buf.blobs.iter() {
+                        let img_buf = frozen_buf.slice(meta.start..meta.end);
+                        reconstruct_state.update_key(
+                            &meta.meta.key,
+                            self.lsn,
+                            Value::Image(img_buf),
+                        );
+                    }
+                }
+                Err(err) => {
+                    let kind = err.kind();
+                    for (_, blob_meta) in read.blobs_at.as_slice() {
+                        reconstruct_state.on_key_error(
+                            blob_meta.key,
+                            PageReconstructError::from(anyhow!(
+                                "Failed to read blobs from virtual file {}: {}",
+                                self.file.path,
+                                kind
+                            )),
+                        );
+                    }
+                }
+            };
+        }
    }
 }

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -270,7 +270,7 @@ impl Layer {
    pub(crate) async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
@@ -285,7 +285,7 @@ impl Layer {
            .record_access(LayerAccessKind::GetValueReconstructData, ctx);

        layer
-            .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx)
+            .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
            .await
    }
@@ -1296,9 +1296,14 @@ impl DownloadedLayer {
                    owner.desc.key_range.clone(),
                    owner.desc.lsn_range.clone(),
                ));
-                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Delta))
+                delta_layer::DeltaLayerInner::load(
+                    &owner.path,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
@@ -1307,9 +1312,15 @@ impl DownloadedLayer {
                    owner.desc.key_range.clone(),
                    lsn,
                ));
-                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
-                    .await
-                    .map(|res| res.map(LayerKind::Image))
+                image_layer::ImageLayerInner::load(
+                    &owner.path,
+                    lsn,
+                    summary,
+                    Some(owner.conf.max_vectored_read_bytes),
+                    ctx,
+                )
+                .await
+                .map(|res| res.map(LayerKind::Image))
            };

            match res {
@@ -1362,7 +1373,7 @@ impl DownloadedLayer {
    async fn get_values_reconstruct_data(
        &self,
        keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
        reconstruct_data: &mut ValuesReconstructState,
        owner: &Arc<LayerInner>,
        ctx: &RequestContext,
@@ -1371,7 +1382,7 @@ impl DownloadedLayer {

        match self.get(owner, ctx).await.map_err(GetVectoredError::from)? {
            Delta(d) => {
-                d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx)
+                d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx)
                    .await
            }
            Image(i) => {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -777,8 +777,10 @@ impl Timeline {
            GetVectoredImpl::Vectored => {
                let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await;

-                self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
-                    .await;
+                if self.conf.validate_vectored_get {
+                    self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx)
+                        .await;
+                }

                vectored_res
            }
@@ -2892,8 +2894,7 @@ impl Timeline {
                                (
                                    ReadableLayerDesc::Persistent {
                                        desc: (*layer).clone(),
-                                        lsn_floor,
-                                        lsn_ceil: cont_lsn,
+                                        lsn_range: lsn_floor..cont_lsn,
                                    },
                                    keyspace_accum.to_keyspace(),
                                )
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -0,0 +1,436 @@
+//!
+//! Utilities for vectored reading of variable-sized "blobs".
+//!
+//! The "blob" api is an abstraction on top of the "block" api,
+//! with the main difference being that blobs do not have a fixed
+//! size (each blob is prefixed with 1 or 4 byte length field)
+//!
+//! The vectored apis provided in this module allow for planning
+//! and executing disk IO which covers multiple blobs.
+//!
+//! Reads are planned with [`VectoredReadPlanner`] which will coalesce
+//! adjacent blocks into a single disk IO request and executed by
+//! [`VectoredBlobReader`] which does all the required offset juggling
+//! and returns a buffer housing all the blobs and a list of offsets.
+//!
+//! Note that the vectored blob api does *not* go through the page cache.
+
+use std::collections::BTreeMap;
+use std::num::NonZeroUsize;
+
+use bytes::BytesMut;
+use pageserver_api::key::Key;
+use utils::lsn::Lsn;
+use utils::vec_map::VecMap;
+
+use crate::virtual_file::VirtualFile;
+
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub struct MaxVectoredReadBytes(pub NonZeroUsize);
+
+/// Metadata bundled with the start and end offset of a blob.
+#[derive(Copy, Clone, Debug)]
+pub struct BlobMeta {
+    pub key: Key,
+    pub lsn: Lsn,
+}
+
+/// Blob offsets into [`VectoredBlobsBuf::buf`]
+pub struct VectoredBlob {
+    pub start: usize,
+    pub end: usize,
+    pub meta: BlobMeta,
+}
+
+/// Return type of [`VectoredBlobReader::read_blobs`]
+pub struct VectoredBlobsBuf {
+    /// Buffer for all blobs in this read
+    pub buf: BytesMut,
+    /// Offsets into the buffer and metadata for all blobs in this read
+    pub blobs: Vec<VectoredBlob>,
+}
+
+/// Description of one disk read for multiple blobs.
+/// Used as the argument for [`VectoredBlobReader::read_blobs`]
+#[derive(Debug)]
+pub struct VectoredRead {
+    pub start: u64,
+    pub end: u64,
+    /// Starting offsets and metadata for each blob in this read
+    pub blobs_at: VecMap<u64, BlobMeta>,
+}
+
+impl VectoredRead {
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+}
+
+#[derive(Eq, PartialEq)]
+enum VectoredReadExtended {
+    Yes,
+    No,
+}
+
+struct VectoredReadBuilder {
+    start: u64,
+    end: u64,
+    blobs_at: VecMap<u64, BlobMeta>,
+    max_read_size: usize,
+}
+
+impl VectoredReadBuilder {
+    fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
+        let mut blobs_at = VecMap::default();
+        blobs_at
+            .append(start_offset, meta)
+            .expect("First insertion always succeeds");
+
+        Self {
+            start: start_offset,
+            end: end_offset,
+            blobs_at,
+            max_read_size,
+        }
+    }
+
+    /// Attempt to extend the current read with a new blob if the start
+    /// offset matches with the current end of the vectored read
+    /// and the resulting size does not exceed the max read size
+    fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        let size = (end - start) as usize;
+        if self.end == start && self.size() + size <= self.max_read_size {
+            self.end = end;
+            self.blobs_at
+                .append(start, meta)
+                .expect("LSNs are ordered within vectored reads");
+
+            return VectoredReadExtended::Yes;
+        }
+
+        VectoredReadExtended::No
+    }
+
+    fn size(&self) -> usize {
+        (self.end - self.start) as usize
+    }
+
+    fn build(self) -> VectoredRead {
+        VectoredRead {
+            start: self.start,
+            end: self.end,
+            blobs_at: self.blobs_at,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum BlobFlag {
+    None,
+    Ignore,
+    Replaces,
+}
+
+/// Planner for vectored blob reads.
+///
+/// Blob offsets are received via [`VectoredReadPlanner::handle`]
+/// and coalesced into disk reads.
+///
+/// The implementation is very simple:
+/// * Collect all blob offsets in an ordered structure
+/// * Iterate over the collected blobs and coalesce them into reads at the end
+pub struct VectoredReadPlanner {
+    // Track all the blob offsets. Start offsets must be ordered.
+    blobs: BTreeMap<Key, Vec<(Lsn, u64, u64)>>,
+    // Arguments for previous blob passed into [`VectoredReadPlanner::handle`]
+    prev: Option<(Key, Lsn, u64, BlobFlag)>,
+
+    max_read_size: usize,
+}
+
+impl VectoredReadPlanner {
+    pub fn new(max_read_size: usize) -> Self {
+        Self {
+            blobs: BTreeMap::new(),
+            prev: None,
+            max_read_size,
+        }
+    }
+
+    /// Include a new blob in the read plan.
+    ///
+    /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads`
+    /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all
+    /// keys in a given keyspace. This function must be called for each key in the desired
+    /// keyspace (monotonically continuous). [`Self::handle_range_end`] must
+    /// be called after every range in the keyspace.
+    ///
+    /// In the event that keys are skipped, the behaviour is undefined and can lead to an
+    /// incorrect read plan. We can end up asserting, erroring in wal redo or returning
+    /// incorrect data to the user.
+    ///
+    /// The `flag` argument has two interesting values:
+    /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs.
+    /// This is used for WAL records that `will_init`.
+    /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens
+    /// if the blob is cached.
+    pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) {
+        // Implementation note: internally lag behind by one blob such that
+        // we have a start and end offset when initialising [`VectoredRead`]
+        let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev {
+            None => {
+                self.prev = Some((key, lsn, offset, flag));
+                return;
+            }
+            Some(prev) => prev,
+        };
+
+        self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+
+        self.prev = Some((key, lsn, offset, flag));
+    }
+
+    pub fn handle_range_end(&mut self, offset: u64) {
+        if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev {
+            self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag);
+        }
+
+        self.prev = None;
+    }
+
+    fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) {
+        match flag {
+            BlobFlag::None => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Replaces => {
+                let blobs_for_key = self.blobs.entry(key).or_default();
+                blobs_for_key.clear();
+                blobs_for_key.push((lsn, start_offset, end_offset));
+            }
+            BlobFlag::Ignore => {}
+        }
+    }
+
+    pub fn finish(self) -> Vec<VectoredRead> {
+        let mut current_read_builder: Option<VectoredReadBuilder> = None;
+        let mut reads = Vec::new();
+
+        for (key, blobs_for_key) in self.blobs {
+            for (lsn, start_offset, end_offset) in blobs_for_key {
+                let extended = match &mut current_read_builder {
+                    Some(read_builder) => {
+                        read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn })
+                    }
+                    None => VectoredReadExtended::No,
+                };
+
+                if extended == VectoredReadExtended::No {
+                    let next_read_builder = VectoredReadBuilder::new(
+                        start_offset,
+                        end_offset,
+                        BlobMeta { key, lsn },
+                        self.max_read_size,
+                    );
+
+                    let prev_read_builder = current_read_builder.replace(next_read_builder);
+
+                    // `current_read_builder` is None in the first iteration of the outer loop
+                    if let Some(read_builder) = prev_read_builder {
+                        reads.push(read_builder.build());
+                    }
+                }
+            }
+        }
+
+        if let Some(read_builder) = current_read_builder {
+            reads.push(read_builder.build());
+        }
+
+        reads
+    }
+}
+
+/// Disk reader for vectored blob spans (does not go through the page cache)
+pub struct VectoredBlobReader<'a> {
+    file: &'a VirtualFile,
+}
+
+impl<'a> VectoredBlobReader<'a> {
+    pub fn new(file: &'a VirtualFile) -> Self {
+        Self { file }
+    }
+
+    /// Read the requested blobs into the buffer.
+    ///
+    /// We have to deal with the fact that blobs are not fixed size.
+    /// Each blob is prefixed by a size header.
+    ///
+    /// The success return value is a struct which contains the buffer
+    /// filled from disk and a list of offsets at which each blob lies
+    /// in the buffer.
+    pub async fn read_blobs(
+        &self,
+        read: &VectoredRead,
+        buf: BytesMut,
+    ) -> Result<VectoredBlobsBuf, std::io::Error> {
+        assert!(read.size() > 0);
+        assert!(
+            read.size() <= buf.capacity(),
+            "{} > {}",
+            read.size(),
+            buf.capacity()
+        );
+        let buf = self
+            .file
+            .read_exact_at_n(buf, read.start, read.size())
+            .await?;
+
+        let blobs_at = read.blobs_at.as_slice();
+        let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
+
+        let mut metas = Vec::with_capacity(blobs_at.len());
+
+        // Blobs in `read` only provide their starting offset. The end offset
+        // of a blob is implicit: the start of the next blob if one exists
+        // or the end of the read.
+        let pairs = blobs_at.iter().zip(
+            blobs_at
+                .iter()
+                .map(Some)
+                .skip(1)
+                .chain(std::iter::once(None)),
+        );
+
+        for ((offset, meta), next) in pairs {
+            let offset_in_buf = offset - start_offset;
+            let first_len_byte = buf[offset_in_buf as usize];
+
+            // Each blob is prefixed by a header containing its size.
+            // Extract the size and skip that header to find the start of the data.
+            // The size can be 1 or 4 bytes. The most significant bit is 0 in the
+            // 1 byte case and 1 in the 4 byte case.
+            let (size_length, blob_size) = if first_len_byte < 0x80 {
+                (1, first_len_byte as u64)
+            } else {
+                let mut blob_size_buf = [0u8; 4];
+                let offset_in_buf = offset_in_buf as usize;
+
+                blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
+                blob_size_buf[0] &= 0x7f;
+                (4, u32::from_be_bytes(blob_size_buf) as u64)
+            };
+
+            let start = offset_in_buf + size_length;
+            let end = match next {
+                Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
+                None => start + blob_size,
+            };
+
+            assert_eq!(end - start, blob_size);
+
+            metas.push(VectoredBlob {
+                start: start as usize,
+                end: end as usize,
+                meta: *meta,
+            })
+        }
+
+        Ok(VectoredBlobsBuf { buf, blobs: metas })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
+        assert_eq!(read.start, offset_range.first().unwrap().2);
+
+        let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
+
+        let offsets_in_read: Vec<_> = read
+            .blobs_at
+            .as_slice()
+            .iter()
+            .map(|(offset, _)| *offset)
+            .collect();
+
+        assert_eq!(expected_offsets_in_read, offsets_in_read);
+    }
+
+    #[test]
+    fn planner_max_read_size_test() {
+        let max_read_size = 128 * 1024;
+        let key = Key::MIN;
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (key, lsn, 0, BlobFlag::None),
+            (key, lsn, 32 * 1024, BlobFlag::None),
+            (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1
+            (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2
+            (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3
+            (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4
+            (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5
+            (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6
+        ];
+
+        let ranges = [
+            &blob_descriptions[0..3],
+            &blob_descriptions[3..4],
+            &blob_descriptions[4..5],
+            &blob_descriptions[5..6],
+            &blob_descriptions[6..7],
+            &blob_descriptions[7..],
+        ];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(652 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 6);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+
+    #[test]
+    fn planner_replacement_test() {
+        let max_read_size = 128 * 1024;
+        let first_key = Key::MIN;
+        let second_key = first_key.next();
+        let lsn = Lsn(0);
+
+        let blob_descriptions = vec![
+            (first_key, lsn, 0, BlobFlag::None),    // First in read 1
+            (first_key, lsn, 1024, BlobFlag::None), // Last in read 1
+            (second_key, lsn, 2 * 1024, BlobFlag::Replaces),
+            (second_key, lsn, 3 * 1024, BlobFlag::None),
+            (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2
+            (second_key, lsn, 5 * 1024, BlobFlag::None),     // Last in read 2
+        ];
+
+        let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]];
+
+        let mut planner = VectoredReadPlanner::new(max_read_size);
+        for (key, lsn, offset, flag) in blob_descriptions.clone() {
+            planner.handle(key, lsn, offset, flag);
+        }
+
+        planner.handle_range_end(6 * 1024);
+
+        let reads = planner.finish();
+        assert_eq!(reads.len(), 2);
+
+        for (idx, read) in reads.iter().enumerate() {
+            validate_read(read, ranges[idx]);
+        }
+    }
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -548,7 +548,18 @@ impl VirtualFile {
        B: IoBufMut + Send,
    {
        let (buf, res) =
-            read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await;
+            read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await;
+        res.map(|()| buf)
+    }
+
+    pub async fn read_exact_at_n<B>(&self, buf: B, offset: u64, count: usize) -> Result<B, Error>
+    where
+        B: IoBufMut + Send,
+    {
+        let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| {
+            self.read_at(buf, offset)
+        })
+        .await;
        res.map(|()| buf)
    }

@@ -682,6 +693,7 @@ impl VirtualFile {
 pub async fn read_exact_at_impl<B, F, Fut>(
    buf: B,
    mut offset: u64,
+    count: Option<usize>,
    mut read_at: F,
 ) -> (B, std::io::Result<()>)
 where
@@ -689,7 +701,15 @@ where
    F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
    Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
 {
-    let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
+    let mut buf: tokio_epoll_uring::Slice<B> = match count {
+        Some(count) => {
+            assert!(count <= buf.bytes_total());
+            assert!(count > 0);
+            buf.slice(..count) // may include uninitialized memory
+        }
+        None => buf.slice_full(), // includes all the uninitialized memory
+    };
+
    while buf.bytes_total() != 0 {
        let res;
        (buf, res) = read_at(buf, offset).await;
@@ -779,7 +799,7 @@ mod test_read_exact_at_impl {
                result: Ok(vec![b'a', b'b', b'c', b'd', b'e']),
            }]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -788,13 +808,33 @@ mod test_read_exact_at_impl {
        assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']);
    }

+    #[tokio::test]
+    async fn test_with_count() {
+        // A single read of three bytes at offset zero is expected, even though
+        // the buffer below has capacity for five.
+        let expected_read = Expectation {
+            offset: 0,
+            bytes_total: 3,
+            result: Ok(vec![b'a', b'b', b'c']),
+        };
+        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
+            expectations: VecDeque::from(vec![expected_read]),
+        }));
+
+        let buf = Vec::with_capacity(5);
+        let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |slice, pos| {
+            let mock = Arc::clone(&mock_read_at);
+            async move { mock.lock().await.read_at(slice, pos).await }
+        })
+        .await;
+        assert!(res.is_ok());
+        assert_eq!(buf, vec![b'a', b'b', b'c']);
+    }
+
    #[tokio::test]
    async fn test_empty_buf_issues_no_syscall() {
        let buf = Vec::new();
        let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt {
            expectations: VecDeque::new(),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -819,7 +859,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
@@ -850,7 +890,7 @@ mod test_read_exact_at_impl {
                },
            ]),
        }));
-        let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| {
+        let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| {
            let mock_read_at = Arc::clone(&mock_read_at);
            async move { mock_read_at.lock().await.read_at(buf, offset).await }
        })
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1115,6 +1115,13 @@ class NeonEnv:
        # bounce through retries on startup
        self.attachment_service.start()

+        def attachment_service_ready():
+            assert self.attachment_service.ready() is True
+
+        # Wait for attachment service readiness to prevent unnecessary post start-up
+        # reconcile.
+        wait_until(30, 1, attachment_service_ready)
+
        # Start up broker, pageserver and all safekeepers
        futs = []
        with concurrent.futures.ThreadPoolExecutor(
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -0,0 +1,195 @@
+import asyncio
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)
+
+
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
+@pytest.mark.parametrize("n_tenants", [10])
+@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
+@pytest.mark.timeout(1000)
+def test_basebackup_with_high_slru_count(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    get_vectored_impl: str,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    """
+    Run the basebackup benchmark once per `get_vectored_impl` setting against
+    `n_tenants` tenants built from the template in `setup_tenant_template`.
+    """
+
+    def record(metric, **kwargs):
+        zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs)
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; "
+        f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false"
+    )
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {
+        # params from fixtures
+        "n_tenants": (n_tenants, {"unit": ""}),
+        "pgbench_scale": (pgbench_scale, {"unit": ""}),
+        "duration": (duration, {"unit": "s"}),
+        # pageserver config overrides; the page cache size is reported in bytes
+        "pageserver_config_override.page_cache_size": (
+            page_cache_size * 8192,
+            {"unit": "byte"},
+        ),
+        "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+    }
+    for name, (value, extra) in params.items():
+        record(name, metric_value=value, report=MetricReport.TEST_PARAM, **extra)
+
+    n_txns = 500000
+    env = setup_pageserver_with_tenants(
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
+    )
+    run_benchmark(env, pg_bin, record, duration)
+
+
+def setup_tenant_template(env: NeonEnv, n_txns: int):
+    """
+    Create the template tenant, re-attach it with the config below, run the
+    update workload against it, then checkpoint and compact its timeline.
+
+    Returns the tuple (tenant id, timeline id, tenant config).
+    """
+    tenant_conf = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(tenant_id)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(tenant_id, tenant_conf)
+
+    http_client = env.pageserver.http_client()
+
+    with env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=["shared_buffers=1MB"]
+    ) as endpoint:
+        # ten concurrent update workers, each working on its own table
+        asyncio.run(run_updates(endpoint, n_txns, 10))
+
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        http_client.timeline_checkpoint(tenant_id, timeline_id)
+        http_client.timeline_compact(tenant_id, timeline_id)
+
+    return (tenant_id, timeline_id, tenant_conf)
+
+
+# Takes about 5 minutes and produces tenants with around 300 SLRU blocks
+# of 8 KiB each.
+async def run_updates(ep: Endpoint, n_txns: int, workers_count: int):
+    """Start `workers_count` update workers as tasks and wait for all of them."""
+    tasks = [
+        asyncio.create_task(run_update_loop_worker(ep, n_txns, worker_idx))
+        for worker_idx in range(workers_count)
+    ]
+    await asyncio.gather(*tasks)
+
+
+async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int):
+    """
+    Create table `t_{idx}` with a single row and update that row `n_txns`
+    times from a stored procedure that commits after every update.
+
+    The connection opened here is closed before returning, including when one
+    of the statements fails.
+    """
+    table = f"t_{idx}"
+    create_procedure = """
+         CREATE PROCEDURE updating{0}() as
+         $$
+             DECLARE
+             i integer;
+             BEGIN
+             FOR i IN 1..{1} LOOP
+                 UPDATE {0} SET x = x + 1 WHERE pk=1;
+                 COMMIT;
+             END LOOP;
+             END
+         $$ LANGUAGE plpgsql
+         """.format(table, n_txns)
+
+    conn = await ep.connect_async()
+    try:
+        await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)")
+        await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)")
+        await conn.execute(f"INSERT INTO {table} VALUES (1, 0)")
+        await conn.execute(create_procedure)
+        # the procedure call runs far longer than any sensible statement timeout
+        await conn.execute("SET statement_timeout=0")
+        await conn.execute(f"call updating{table}()")
+    finally:
+        # release the connection instead of leaking it until the event loop
+        # is torn down
+        await conn.close()
+
+
+def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int):
+    """
+    Run `pagebench basebackup` for `duration_secs` seconds and record the total
+    request count, mean latency and latency percentiles from its JSON output.
+    """
+    mgmt_api = env.pageserver.http_client()
+    pagebench_cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "basebackup",
+        "--mgmt-api-endpoint",
+        mgmt_api.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--gzip-probability",
+        "1",
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+
+    log.info(f"command: {' '.join(pagebench_cmd)}")
+    output_base = pg_bin.run_capture(pagebench_cmd, with_command_header=False)
+    results_path = Path(output_base + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    results = json.loads(results_path.read_text())
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+
+    record(
+        "request_count",
+        metric_value=total["request_count"],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    record(
+        "latency_mean",
+        metric_value=humantime_to_ms(total["latency_mean"]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    for percentile, latency in total["latency_percentiles"].items():
+        record(
+            f"latency_percentiles.{percentile}",
+            metric_value=humantime_to_ms(latency),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -3,7 +3,6 @@ import os
 from pathlib import Path
 from typing import Any, Dict, Tuple

-import fixtures.pageserver.many_tenants as many_tenants
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
@@ -15,7 +14,9 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.utils import get_scale_for_db, humantime_to_ms

-from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+from performance.pageserver.util import (
+    setup_pageserver_with_tenants,
+)


 # For reference, the space usage of the snapshots:
@@ -80,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(

    for param, (value, kwargs) in params.items():
        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
-    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+
+    def setup_wrapper(env: NeonEnv):
+        return setup_tenant_template(env, pg_bin, pgbench_scale)
+
+    env = setup_pageserver_with_tenants(
+        neon_env_builder,
+        f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
+        n_tenants,
+        setup_wrapper,
+    )
    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)


+def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
+    """
+    Set up the template tenant that the test infra replicates.
+
+    The tenant is a pgbench tenant initialized at the given scale; afterwards it
+    goes through repeated rounds of (pgbench simple-update workload, checkpoint,
+    compact). Returns the tuple (tenant id, timeline id, tenant config).
+    """
+    # use a config that makes production of on-disk state timing-insensitive
+    # as we ingest data into the tenant.
+    tenant_conf = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "0s",  # disable periodic compaction
+        "compaction_threshold": 10,
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 3,
+    }
+    tenant_id, timeline_id = env.neon_cli.create_tenant(set_default=True)
+    env.pageserver.tenant_detach(tenant_id)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+    env.pageserver.tenant_attach(tenant_id, tenant_conf)
+    http_client = env.pageserver.http_client()
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+
+        def flush_checkpoint_compact():
+            # wait for the last flush LSN, then checkpoint and compact the timeline
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+            http_client.timeline_checkpoint(tenant_id, timeline_id)
+            http_client.timeline_compact(tenant_id, timeline_id)
+
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", endpoint.connstr()])
+        flush_checkpoint_compact()
+        # some prime number to avoid potential resonances with the "_threshold" variables from the config
+        for _ in range(17):
+            # the L0s produced by this appear to have size ~5MiB
+            num_txns = 10_000
+            pg_bin.run_capture(
+                ["pgbench", "-N", "-c1", "--transactions", str(num_txns), endpoint.connstr()]
+            )
+            flush_checkpoint_compact()
+    # for reference, the output at scale=6 looked like so (306M total)
+    # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+    # total 306M
+    # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+    # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+    #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+    #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+    #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+    # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+    # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+    # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+    # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+    # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+    return (tenant_id, timeline_id, tenant_conf)
+
+
 def run_benchmark_max_throughput_latest_lsn(
    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
 ):
@@ -138,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn(
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
-
-
-def setup_pageserver_with_pgbench_tenants(
-    neon_env_builder: NeonEnvBuilder,
-    pg_bin: PgBin,
-    n_tenants: int,
-    scale: int,
-) -> NeonEnv:
-    """
-    Utility function to set up a pageserver with a given number of identical tenants.
-    Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards
-    with a repeat application of (pgbench simple-update workload, checkpoint, compact).
-    """
-
-    def setup_template(env: NeonEnv):
-        # use a config that makes production of on-disk state timing-insensitive
-        # as we ingest data into the tenant.
-        config = {
-            "gc_period": "0s",  # disable periodic gc
-            "checkpoint_timeout": "10 years",
-            "compaction_period": "0s",  # disable periodic compaction
-            "compaction_threshold": 10,
-            "compaction_target_size": 134217728,
-            "checkpoint_distance": 268435456,
-            "image_creation_threshold": 3,
-        }
-        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
-        env.pageserver.tenant_detach(template_tenant)
-        env.pageserver.allowed_errors.append(
-            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
-            ".*Dropped remote consistent LSN updates.*",
-        )
-        env.pageserver.tenant_attach(template_tenant, config)
-        ps_http = env.pageserver.http_client()
-        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
-            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
-            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-            ps_http.timeline_checkpoint(template_tenant, template_timeline)
-            ps_http.timeline_compact(template_tenant, template_timeline)
-            for _ in range(
-                0, 17
-            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
-                # the L0s produced by this appear to have size ~5MiB
-                num_txns = 10_000
-                pg_bin.run_capture(
-                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
-                )
-                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
-                ps_http.timeline_checkpoint(template_tenant, template_timeline)
-                ps_http.timeline_compact(template_tenant, template_timeline)
-        # for reference, the output at scale=6 looked like so (306M total)
-        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
-        # total 306M
-        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
-        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
-        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
-        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
-        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
-        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
-        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
-        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
-        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
-        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
-
-        return (template_tenant, template_timeline, config)
-
-    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
-        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
-
-    env = neon_env_builder.build_and_use_snapshot(
-        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
-    )
-    env.start()
-    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
-    return env
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -2,9 +2,16 @@
 Utilities used by all code in this sub-directory
 """

+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
 from fixtures.pageserver.utils import wait_until_all_tenants_state
+from fixtures.types import TenantId, TimelineId


 def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
@@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
                assert not layer.remote

    log.info("ready")
+
+
+def setup_pageserver_with_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    name: str,
+    n_tenants: int,
+    setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+) -> NeonEnv:
+    """
+    Set up a pageserver that hosts `n_tenants` identical tenants.
+
+    The environment comes from `build_and_use_snapshot` under the snapshot name
+    `name`; its tenants are produced by `many_tenants.single_timeline` from the
+    `setup` template callback. The environment is started and checked for
+    benchmark readiness before it is returned.
+    """
+    env = neon_env_builder.build_and_use_snapshot(
+        name,
+        lambda neon_env_builder: many_tenants.single_timeline(
+            neon_env_builder, setup, n_tenants
+        ),
+    )
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env