diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 42c4e9ff48..c4c282f33d 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -12,7 +12,7 @@ use std::collections::BinaryHeap; use std::ops::Range; use std::{fs, str}; -use pageserver::page_cache::PAGE_SZ; +use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; @@ -100,13 +100,15 @@ pub(crate) fn parse_filename(name: &str) -> Option { // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH" async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result> { - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - file, + block_reader, ); // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 27efa6d028..be8f91675d 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -61,13 +61,15 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); - let file = FileBlockReader::new(VirtualFile::open(path).await?); - let summary_blk = file.read_blk(0, ctx).await?; + let file = VirtualFile::open(path).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( actual_summary.index_start_blk, actual_summary.index_root_blk, - &file, + &block_reader, ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. let mut all = vec![]; @@ -83,7 +85,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result ctx, ) .await?; - let cursor = BlockCursor::new_fileblockreader(&file); + let cursor = BlockCursor::new_fileblockreader(&block_reader); for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 2d61b0e252..55844be041 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -8,7 +8,7 @@ use utils::lsn::Lsn; use rand::prelude::*; use tokio::sync::Barrier; use tokio::task::JoinSet; -use tracing::{debug, info, instrument}; +use tracing::{info, instrument}; use std::collections::HashMap; use std::num::NonZeroUsize; @@ -25,8 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats}; pub(crate) struct Args { #[clap(long, default_value = "http://localhost:9898")] mgmt_api_endpoint: String, - #[clap(long, default_value = "localhost:64000")] - page_service_host_port: String, + #[clap(long, default_value = "postgres://postgres@localhost:64000")] + page_service_connstring: String, #[clap(long)] pageserver_jwt: Option, #[clap(long, default_value = "1")] @@ -230,12 +230,9 @@ async fn client( ) { start_work_barrier.wait().await; - let client = pageserver_client::page_service::Client::new(crate::util::connstring::connstring( - &args.page_service_host_port, - args.pageserver_jwt.as_deref(), - )) - .await - .unwrap(); + let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone()) + .await + .unwrap(); while let Some(Work { lsn, gzip }) = work.recv().await { let start = Instant::now(); @@ -263,7 +260,7 @@ async fn client( } }) .await; - debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); + info!("basebackup size is {} bytes", size.load(Ordering::Relaxed)); let elapsed = start.elapsed(); live_stats.inc(); STATS.with(|stats| { diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs index 9fa77f0671..5d688ed2d1 100644 --- a/pageserver/pagebench/src/main.rs +++ b/pageserver/pagebench/src/main.rs @@ -3,7 +3,6 @@ use utils::logging; /// Re-usable pieces of code that aren't CLI-specific. mod util { - pub(crate) mod connstring; pub(crate) mod request_stats; #[macro_use] pub(crate) mod tokio_thread_local_stats; diff --git a/pageserver/pagebench/src/util/connstring.rs b/pageserver/pagebench/src/util/connstring.rs deleted file mode 100644 index 07a0ff042d..0000000000 --- a/pageserver/pagebench/src/util/connstring.rs +++ /dev/null @@ -1,8 +0,0 @@ -pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String { - let colon_and_jwt = if let Some(jwt) = jwt { - format!(":{jwt}") // TODO: urlescape - } else { - String::new() - }; - format!("postgres://postgres{colon_and_jwt}@{host_port}") -} diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index c862816b80..0479d05f8f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -143,6 +143,7 @@ where ar: &'a mut Builder<&'b mut W>, buf: Vec, current_segment: Option<(SlruKind, u32)>, + total_blocks: usize, } impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W> @@ -154,6 +155,7 @@ where ar, buf: Vec::new(), current_segment: None, + total_blocks: 0, } } @@ -199,7 +201,8 @@ where let header = new_tar_header(&segname, self.buf.len() as u64)?; self.ar.append(&header, self.buf.as_slice()).await?; - trace!("Added to basebackup slru {} relsize {}", segname, nblocks); + self.total_blocks += nblocks; + debug!("Added to basebackup slru {} relsize {}", segname, nblocks); self.buf.clear(); @@ -207,11 +210,15 @@ where } async fn finish(mut self) -> anyhow::Result<()> { - if self.current_segment.is_none() || self.buf.is_empty() { - return Ok(()); - } + let res = if self.current_segment.is_none() || self.buf.is_empty() { + Ok(()) + } else { + self.flush().await + }; - self.flush().await + info!("Collected {} SLRU blocks", self.total_blocks); + + res } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index b0d828d066..d18b8d6885 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -34,6 +34,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::timeline::GetVectoredImpl; +use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; @@ -87,6 +88,10 @@ pub mod defaults { pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential"; + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; + /// /// Default built-in configuration file. /// @@ -126,6 +131,10 @@ pub mod defaults { #get_vectored_impl = '{DEFAULT_GET_VECTORED_IMPL}' +#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' + +#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}' + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -262,6 +271,10 @@ pub struct PageServerConf { pub virtual_file_io_engine: virtual_file::IoEngineKind, pub get_vectored_impl: GetVectoredImpl, + + pub max_vectored_read_bytes: MaxVectoredReadBytes, + + pub validate_vectored_get: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -350,6 +363,10 @@ struct PageServerConfigBuilder { virtual_file_io_engine: BuilderValue, get_vectored_impl: BuilderValue, + + max_vectored_read_bytes: BuilderValue, + + validate_vectored_get: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -429,6 +446,10 @@ impl Default for PageServerConfigBuilder { virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), get_vectored_impl: Set(DEFAULT_GET_VECTORED_IMPL.parse().unwrap()), + max_vectored_read_bytes: Set(MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), } } } @@ -593,6 +614,14 @@ impl PageServerConfigBuilder { self.get_vectored_impl = BuilderValue::Set(value); } + pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { + self.max_vectored_read_bytes = BuilderValue::Set(value); + } + + pub fn get_validate_vectored_get(&mut self, value: bool) { + self.validate_vectored_get = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -706,6 +735,12 @@ impl PageServerConfigBuilder { get_vectored_impl: self .get_vectored_impl .ok_or(anyhow!("missing get_vectored_impl"))?, + max_vectored_read_bytes: self + .max_vectored_read_bytes + .ok_or(anyhow!("missing max_vectored_read_bytes"))?, + validate_vectored_get: self + .validate_vectored_get + .ok_or(anyhow!("missing validate_vectored_get"))?, }) } } @@ -952,6 +987,15 @@ impl PageServerConf { "get_vectored_impl" => { builder.get_vectored_impl(parse_toml_from_str("get_vectored_impl", item)?) } + "max_vectored_read_bytes" => { + let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; + builder.get_max_vectored_read_bytes( + MaxVectoredReadBytes( + NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) + } + "validate_vectored_get" => { + builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) + } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1027,6 +1071,11 @@ impl PageServerConf { ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant"), + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, } } } @@ -1261,6 +1310,11 @@ background_task_maximum_delay = '334 s' ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Correct defaults should be used when no config values are provided" ); @@ -1326,6 +1380,11 @@ background_task_maximum_delay = '334 s' ingest_batch_size: 100, virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), get_vectored_impl: defaults::DEFAULT_GET_VECTORED_IMPL.parse().unwrap(), + max_vectored_read_bytes: MaxVectoredReadBytes( + NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) + .expect("Invalid default constant") + ), + validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 96b78de50c..6a63a2adeb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -145,6 +145,7 @@ macro_rules! pausable_failpoint { pub mod blob_io; pub mod block_io; +pub mod vectored_blob_io; pub mod disk_btree; pub(crate) mod ephemeral_file; diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 1b6bccc120..37c84be342 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,7 +5,7 @@ use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; -use crate::page_cache::{self, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; use crate::virtual_file::VirtualFile; use bytes::Bytes; use std::ops::Deref; @@ -78,7 +78,7 @@ impl<'a> Deref for BlockLease<'a> { /// /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { - FileBlockReader(&'a FileBlockReader), + FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), #[cfg(test)] @@ -160,17 +160,15 @@ impl<'a> BlockCursor<'a> { /// /// The file is assumed to be immutable. This doesn't provide any functions /// for modifying the file, nor for invalidating the cache if it is modified. -pub struct FileBlockReader { - pub file: VirtualFile, +pub struct FileBlockReader<'a> { + pub file: &'a VirtualFile, /// Unique ID of this file, used as key in the page cache. file_id: page_cache::FileId, } -impl FileBlockReader { - pub fn new(file: VirtualFile) -> Self { - let file_id = page_cache::next_file_id(); - +impl<'a> FileBlockReader<'a> { + pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { FileBlockReader { file_id, file } } @@ -190,11 +188,11 @@ impl FileBlockReader { /// Returns a "lease" object that can be used to /// access to the contents of the page. (For the page cache, the /// lease object represents a lock on the buffer.) - pub async fn read_blk( + pub async fn read_blk<'b>( &self, blknum: u32, ctx: &RequestContext, - ) -> Result { + ) -> Result, std::io::Error> { let cache = page_cache::get(); match cache .read_immutable_buf(self.file_id, blknum, ctx) @@ -215,7 +213,7 @@ impl FileBlockReader { } } -impl BlockReader for FileBlockReader { +impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { BlockCursor::new(BlockReaderRef::FileBlockReader(self)) } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 73c018db31..9de820912e 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -209,8 +209,7 @@ impl Default for ValuesReconstructState { pub(crate) enum ReadableLayerDesc { Persistent { desc: PersistentLayerDesc, - lsn_floor: Lsn, - lsn_ceil: Lsn, + lsn_range: Range, }, InMemory { handle: InMemoryLayerHandle, @@ -309,14 +308,14 @@ impl Eq for ReadableLayerDescOrdered {} impl ReadableLayerDesc { pub(crate) fn get_lsn_floor(&self) -> Lsn { match self { - ReadableLayerDesc::Persistent { lsn_floor, .. } => *lsn_floor, + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), } } pub(crate) fn get_lsn_ceil(&self) -> Lsn { match self { - ReadableLayerDesc::Persistent { lsn_ceil, .. } => *lsn_ceil, + ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, } } @@ -329,10 +328,15 @@ impl ReadableLayerDesc { ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_ceil, .. } => { + ReadableLayerDesc::Persistent { desc, lsn_range } => { let layer = layer_manager.get_from_desc(desc); layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data( + keyspace, + lsn_range.clone(), + reconstruct_state, + ctx, + ) .await } ReadableLayerDesc::InMemory { handle, lsn_ceil } => { diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index e636073113..5eaf1cc1ce 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -29,25 +29,28 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; +use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; use std::fs::File; use std::io::SeekFrom; use std::ops::Range; @@ -63,8 +66,7 @@ use utils::{ }; use super::{ - AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValueReconstructSituation, - ValuesReconstructState, + AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, }; /// @@ -214,8 +216,10 @@ pub struct DeltaLayerInner { index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for DeltaLayerInner { @@ -297,7 +301,7 @@ impl DeltaLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result> { let path = self.path(); - let loaded = DeltaLayerInner::load(&path, None, ctx) + let loaded = DeltaLayerInner::load(&path, None, None, ctx) .await .and_then(|res| res)?; @@ -665,16 +669,16 @@ impl DeltaLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != DELTA_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } @@ -698,15 +702,18 @@ impl DeltaLayerInner { pub(super) async fn load( path: &Utf8Path, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); + let file_id = page_cache::next_file_id(); - let summary_blk = match file.read_blk(0, ctx).await { + let block_reader = FileBlockReader::new(&file, file_id); + + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -730,8 +737,10 @@ impl DeltaLayerInner { Ok(Ok(DeltaLayerInner { file, + file_id, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, + max_vectored_read_bytes, })) } @@ -744,11 +753,11 @@ impl DeltaLayerInner { ) -> anyhow::Result { let mut need_image = true; // Scan the page versions backwards, starting from `lsn`. - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + &block_reader, ); let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); @@ -782,19 +791,19 @@ impl DeltaLayerInner { .build(); // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); + let cursor = block_reader.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor .read_blob_into_buf(pos, &mut buf, ctx) .await .with_context(|| { - format!("Failed to read blob from virtual file {}", file.file.path) + format!("Failed to read blob from virtual file {}", self.file.path) })?; let val = Value::des(&buf).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", - file.file.path + self.file.path ) })?; match val { @@ -834,133 +843,181 @@ impl DeltaLayerInner { pub(super) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let file = &self.file; + let reads = self + .plan_reads(keyspace, lsn_range, reconstruct_state, ctx) + .await + .map_err(GetVectoredError::Other)?; + + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + &self, + keyspace: KeySpace, + lsn_range: Range, + reconstruct_state: &mut ValuesReconstructState, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); - let mut offsets: BTreeMap> = BTreeMap::new(); - for range in keyspace.ranges.iter() { - let mut ignore_key = None; + let mut range_end_handled = false; - // Scan the page versions backwards, starting from the last key in the range. - // to collect all the offsets at which need to be read. - let end_key = DeltaKey::from_key_lsn(&range.end, Lsn(end_lsn.0 - 1)); + let start_key = DeltaKey::from_key_lsn(&range.start, lsn_range.start); tree_reader .visit( - &end_key.0, - VisitDirection::Backwards, + &start_key.0, + VisitDirection::Forwards, |raw_key, value| { let key = Key::from_slice(&raw_key[..KEY_SIZE]); - let entry_lsn = DeltaKey::extract_lsn_from_buf(raw_key); - - if entry_lsn >= end_lsn { - return true; - } - - if key < range.start { - return false; - } - - if key >= range.end { - return true; - } - - if Some(key) == ignore_key { - return true; - } - - if let Some(cached_lsn) = reconstruct_state.get_cached_lsn(&key) { - if entry_lsn <= cached_lsn { - return key != range.start; - } - } - + let lsn = DeltaKey::extract_lsn_from_buf(raw_key); let blob_ref = BlobRef(value); - let lsns_at = offsets.entry(key).or_default(); - lsns_at.push((entry_lsn, blob_ref.pos())); - if blob_ref.will_init() { - if key == range.start { - return false; + assert!(key >= range.start && lsn >= lsn_range.start); + + let cached_lsn = reconstruct_state.get_cached_lsn(&key); + let flag = { + if cached_lsn >= Some(lsn) { + BlobFlag::Ignore + } else if blob_ref.will_init() { + BlobFlag::Replaces } else { - ignore_key = Some(key); - return true; + BlobFlag::None } - } + }; - true + if key >= range.end || (key.next() == range.end && lsn >= lsn_range.end) { + planner.handle_range_end(blob_ref.pos()); + range_end_handled = true; + false + } else { + planner.handle(key, lsn, blob_ref.pos(), flag); + true + } }, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::DeltaLayerBtreeNode) .build(), ) .await - .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; - } + .map_err(|err| anyhow!(err))?; - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::DeltaLayerValue) - .build(); - - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (key, lsns_at) in offsets { - for (lsn, block_offset) in lsns_at { - let res = cursor.read_blob_into_buf(block_offset, &mut buf, ctx).await; - - if let Err(e) = res { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to read blob from virtual file {}", - file.file.path - ))), - ); - - break; - } - - let value = Value::des(&buf); - if let Err(e) = value { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to deserialize file blob from virtual file {}", - file.file.path - ))), - ); - - break; - } - - let key_situation = reconstruct_state.update_key(&key, lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - break; - } + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + tracing::info!("Handling range end fallback at {}", payload_end); + planner.handle_range_end(payload_end); } } - Ok(()) + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + let mut ignore_key_with_err = None; + + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + let mut buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + // Note that reads are processed in reverse order (from highest key+lsn). + // This is the order that `ReconstructState` requires such that it can + // track when a key is done. + for read in reads.into_iter().rev() { + let res = vectored_blob_reader + .read_blobs(&read, buf.take().expect("Should have a buffer")) + .await; + + let blobs_buf = match res { + Ok(blobs_buf) => blobs_buf, + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + + // We have "lost" the buffer since the lower level IO api + // doesn't return the buffer on error. Allocate a new one. + buf = Some(BytesMut::with_capacity(max_vectored_read_bytes)); + + continue; + } + }; + + for meta in blobs_buf.blobs.iter().rev() { + if Some(meta.meta.key) == ignore_key_with_err { + continue; + } + + let value = Value::des(&blobs_buf.buf[meta.start..meta.end]); + let value = match value { + Ok(v) => v, + Err(e) => { + reconstruct_state.on_key_error( + meta.meta.key, + PageReconstructError::from(anyhow!(e).context(format!( + "Failed to deserialize blob from virtual file {}", + self.file.path, + ))), + ); + + ignore_key_with_err = Some(meta.meta.key); + continue; + } + }; + + // Invariant: once a key reaches [`ValueReconstructSituation::Complete`] + // state, no further updates shall be made to it. The call below will + // panic if the invariant is violated. + reconstruct_state.update_key(&meta.meta.key, meta.meta.lsn, value); + } + + buf = Some(blobs_buf.buf); + } } pub(super) async fn load_keys<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { - let file = &self.file; - + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); let mut all_keys: Vec> = Vec::new(); @@ -1012,11 +1069,11 @@ impl DeltaLayerInner { self.index_start_blk, self.index_root_blk ); - let file = &self.file; + let block_reader = FileBlockReader::new(&self.file, self.file_id); let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, - file, + block_reader, ); tree_reader.dump().await?; @@ -1111,7 +1168,8 @@ impl> Adapter { blknum: u32, ctx: &RequestContext, ) -> Result { - self.0.as_ref().file.read_blk(blknum, ctx).await + let block_reader = FileBlockReader::new(&self.0.as_ref().file, self.0.as_ref().file_id); + block_reader.read_blk(blknum, ctx).await } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b867cb0333..0a707295cc 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -25,7 +25,7 @@ //! actual page images are stored in the "values" part. use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; +use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; @@ -34,11 +34,14 @@ use crate::tenant::storage_layer::{ LayerAccessStats, ValueReconstructResult, ValueReconstructState, }; use crate::tenant::timeline::GetVectoredError; +use crate::tenant::vectored_blob_io::{ + BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner, +}; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::{self, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use pageserver_api::keyspace::KeySpace; @@ -152,8 +155,10 @@ pub struct ImageLayerInner { lsn: Lsn, - /// Reader object for reading blocks from the file. - file: FileBlockReader, + file: VirtualFile, + file_id: FileId, + + max_vectored_read_bytes: Option, } impl std::fmt::Debug for ImageLayerInner { @@ -167,9 +172,12 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let file = &self.file; - let tree_reader = - DiskBtreeReader::<_, KEY_SIZE>::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); tree_reader.dump().await?; @@ -252,7 +260,7 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx) + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) .await .and_then(|res| res)?; @@ -327,16 +335,16 @@ impl ImageLayer { where F: Fn(Summary) -> Summary, { - let file = VirtualFile::open_with_options( + let mut file = VirtualFile::open_with_options( path, virtual_file::OpenOptions::new().read(true).write(true), ) .await .with_context(|| format!("Failed to open file '{}'", path))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0, ctx).await?; + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = block_reader.read_blk(0, ctx).await?; let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?; - let mut file = file.file; if actual_summary.magic != IMAGE_FILE_MAGIC { return Err(RewriteSummaryError::MagicMismatch); } @@ -361,14 +369,16 @@ impl ImageLayerInner { path: &Utf8Path, lsn: Lsn, summary: Option, + max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path).await { Ok(file) => file, Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))), }; - let file = FileBlockReader::new(file); - let summary_blk = match file.read_blk(0, ctx).await { + let file_id = page_cache::next_file_id(); + let block_reader = FileBlockReader::new(&file, file_id); + let summary_blk = match block_reader.read_blk(0, ctx).await { Ok(blk) => blk, Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))), }; @@ -399,6 +409,8 @@ impl ImageLayerInner { index_root_blk: actual_summary.index_root_blk, lsn, file, + file_id, + max_vectored_read_bytes, })) } @@ -408,8 +420,9 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); @@ -422,7 +435,7 @@ impl ImageLayerInner { ) .await? { - let blob = file + let blob = block_reader .block_cursor() .read_blob( offset, @@ -449,12 +462,36 @@ impl ImageLayerInner { reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { - let file = &self.file; - let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + let reads = self + .plan_reads(keyspace, ctx) + .await + .map_err(GetVectoredError::Other)?; - let mut offsets = Vec::new(); + self.do_reads_and_update_state(reads, reconstruct_state) + .await; + + Ok(()) + } + + async fn plan_reads( + &self, + keyspace: KeySpace, + ctx: &RequestContext, + ) -> anyhow::Result> { + let mut planner = VectoredReadPlanner::new( + self.max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(), + ); + + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); for range in keyspace.ranges.iter() { + let mut range_end_handled = false; + let mut search_key: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; range.start.write_to_byte_slice(&mut search_key); @@ -462,17 +499,18 @@ impl ImageLayerInner { .visit( &search_key, VisitDirection::Forwards, - |raw_key, value| { + |raw_key, offset| { let key = Key::from_slice(&raw_key[..KEY_SIZE]); assert!(key >= range.start); - if !range.contains(&key) { - return false; + if key >= range.end { + planner.handle_range_end(offset); + range_end_handled = true; + false + } else { + planner.handle(key, self.lsn, offset, BlobFlag::None); + true } - - offsets.push((key, value)); - - true }, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::ImageLayerBtreeNode) @@ -480,33 +518,60 @@ impl ImageLayerInner { ) .await .map_err(|err| GetVectoredError::Other(anyhow!(err)))?; - } - let ctx = &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::ImageLayerValue) - .build(); - - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (key, offset) in offsets { - let res = cursor.read_blob_into_buf(offset, &mut buf, ctx).await; - if let Err(e) = res { - reconstruct_state.on_key_error( - key, - PageReconstructError::from(anyhow!(e).context(format!( - "Failed to read blob from virtual file {}", - file.file.path - ))), - ); - - continue; + if !range_end_handled { + let payload_end = self.index_start_blk as u64 * PAGE_SZ as u64; + planner.handle_range_end(payload_end); } - - let blob = Bytes::copy_from_slice(buf.as_slice()); - reconstruct_state.update_key(&key, self.lsn, Value::Image(blob)); } - Ok(()) + Ok(planner.finish()) + } + + async fn do_reads_and_update_state( + &self, + reads: Vec, + reconstruct_state: &mut ValuesReconstructState, + ) { + let max_vectored_read_bytes = self + .max_vectored_read_bytes + .expect("Layer is loaded with max vectored bytes config") + .0 + .into(); + + let vectored_blob_reader = VectoredBlobReader::new(&self.file); + for read in reads.into_iter() { + let buf = BytesMut::with_capacity(max_vectored_read_bytes); + let res = vectored_blob_reader.read_blobs(&read, buf).await; + + match res { + Ok(blobs_buf) => { + let frozen_buf = blobs_buf.buf.freeze(); + + for meta in blobs_buf.blobs.iter() { + let img_buf = frozen_buf.slice(meta.start..meta.end); + reconstruct_state.update_key( + &meta.meta.key, + self.lsn, + Value::Image(img_buf), + ); + } + } + Err(err) => { + let kind = err.kind(); + for (_, blob_meta) in read.blobs_at.as_slice() { + reconstruct_state.on_key_error( + blob_meta.key, + PageReconstructError::from(anyhow!( + "Failed to read blobs from virtual file {}: {}", + self.file.path, + kind + )), + ); + } + } + }; + } } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 61eba07be6..13c9e5c989 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -270,7 +270,7 @@ impl Layer { pub(crate) async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { @@ -285,7 +285,7 @@ impl Layer { .record_access(LayerAccessKind::GetValueReconstructData, ctx); layer - .get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, &self.0, ctx) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx) .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)) .await } @@ -1296,9 +1296,14 @@ impl DownloadedLayer { owner.desc.key_range.clone(), owner.desc.lsn_range.clone(), )); - delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx) - .await - .map(|res| res.map(LayerKind::Delta)) + delta_layer::DeltaLayerInner::load( + &owner.path, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Delta)) } else { let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( @@ -1307,9 +1312,15 @@ impl DownloadedLayer { owner.desc.key_range.clone(), lsn, )); - image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx) - .await - .map(|res| res.map(LayerKind::Image)) + image_layer::ImageLayerInner::load( + &owner.path, + lsn, + summary, + Some(owner.conf.max_vectored_read_bytes), + ctx, + ) + .await + .map(|res| res.map(LayerKind::Image)) }; match res { @@ -1362,7 +1373,7 @@ impl DownloadedLayer { async fn get_values_reconstruct_data( &self, keyspace: KeySpace, - end_lsn: Lsn, + lsn_range: Range, reconstruct_data: &mut ValuesReconstructState, owner: &Arc, ctx: &RequestContext, @@ -1371,7 +1382,7 @@ impl DownloadedLayer { match self.get(owner, ctx).await.map_err(GetVectoredError::from)? { Delta(d) => { - d.get_values_reconstruct_data(keyspace, end_lsn, reconstruct_data, ctx) + d.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, ctx) .await } Image(i) => { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4d820f7b13..fa5e7b3685 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -777,8 +777,10 @@ impl Timeline { GetVectoredImpl::Vectored => { let vectored_res = self.get_vectored_impl(keyspace.clone(), lsn, ctx).await; - self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) - .await; + if self.conf.validate_vectored_get { + self.validate_get_vectored_impl(&vectored_res, keyspace, lsn, ctx) + .await; + } vectored_res } @@ -2892,8 +2894,7 @@ impl Timeline { ( ReadableLayerDesc::Persistent { desc: (*layer).clone(), - lsn_floor, - lsn_ceil: cont_lsn, + lsn_range: lsn_floor..cont_lsn, }, keyspace_accum.to_keyspace(), ) diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs new file mode 100644 index 0000000000..a8d9649d36 --- /dev/null +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -0,0 +1,436 @@ +//! +//! Utilities for vectored reading of variable-sized "blobs". +//! +//! The "blob" api is an abstraction on top of the "block" api, +//! with the main difference being that blobs do not have a fixed +//! size (each blob is prefixed with 1 or 4 byte length field) +//! +//! The vectored apis provided in this module allow for planning +//! and executing disk IO which covers multiple blobs. +//! +//! Reads are planned with [`VectoredReadPlanner`] which will coalesce +//! adjacent blocks into a single disk IO request and exectuted by +//! [`VectoredBlobReader`] which does all the required offset juggling +//! and returns a buffer housing all the blobs and a list of offsets. +//! +//! Note that the vectored blob api does *not* go through the page cache. + +use std::collections::BTreeMap; +use std::num::NonZeroUsize; + +use bytes::BytesMut; +use pageserver_api::key::Key; +use utils::lsn::Lsn; +use utils::vec_map::VecMap; + +use crate::virtual_file::VirtualFile; + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +/// Metadata bundled with the start and end offset of a blob. +#[derive(Copy, Clone, Debug)] +pub struct BlobMeta { + pub key: Key, + pub lsn: Lsn, +} + +/// Blob offsets into [`VectoredBlobsBuf::buf`] +pub struct VectoredBlob { + pub start: usize, + pub end: usize, + pub meta: BlobMeta, +} + +/// Return type of [`VectoredBlobReader::read_blobs`] +pub struct VectoredBlobsBuf { + /// Buffer for all blobs in this read + pub buf: BytesMut, + /// Offsets into the buffer and metadata for all blobs in this read + pub blobs: Vec, +} + +/// Description of one disk read for multiple blobs. +/// Used as the argument form [`VectoredBlobReader::read_blobs`] +#[derive(Debug)] +pub struct VectoredRead { + pub start: u64, + pub end: u64, + /// Starting offsets and metadata for each blob in this read + pub blobs_at: VecMap, +} + +impl VectoredRead { + fn size(&self) -> usize { + (self.end - self.start) as usize + } +} + +#[derive(Eq, PartialEq)] +enum VectoredReadExtended { + Yes, + No, +} + +struct VectoredReadBuilder { + start: u64, + end: u64, + blobs_at: VecMap, + max_read_size: usize, +} + +impl VectoredReadBuilder { + fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + Self { + start: start_offset, + end: end_offset, + blobs_at, + max_read_size, + } + } + + /// Attempt to extend the current read with a new blob if the start + /// offset matches with the current end of the vectored read + /// and the resuting size is below the max read size + fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + let size = (end - start) as usize; + if self.end == start && self.size() + size <= self.max_read_size { + self.end = end; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + fn size(&self) -> usize { + (self.end - self.start) as usize + } + + fn build(self) -> VectoredRead { + VectoredRead { + start: self.start, + end: self.end, + blobs_at: self.blobs_at, + } + } +} + +#[derive(Copy, Clone, Debug)] +pub enum BlobFlag { + None, + Ignore, + Replaces, +} + +/// Planner for vectored blob reads. +/// +/// Blob offsets are received via [`VectoredReadPlanner::handle`] +/// and coalesced into disk reads. +/// +/// The implementation is very simple: +/// * Collect all blob offsets in an ordered structure +/// * Iterate over the collected blobs and coalesce them into reads at the end +pub struct VectoredReadPlanner { + // Track all the blob offsets. Start offsets must be ordered. + blobs: BTreeMap>, + // Arguments for previous blob passed into [`VectoredReadPlanner::handle`] + prev: Option<(Key, Lsn, u64, BlobFlag)>, + + max_read_size: usize, +} + +impl VectoredReadPlanner { + pub fn new(max_read_size: usize) -> Self { + Self { + blobs: BTreeMap::new(), + prev: None, + max_read_size, + } + } + + /// Include a new blob in the read plan. + /// + /// This function is called from a B-Tree index visitor (see `DeltaLayerInner::plan_reads` + /// and `ImageLayerInner::plan_reads`). Said visitor wants to collect blob offsets for all + /// keys in a given keyspace. This function must be called for each key in the desired + /// keyspace (monotonically continuous). [`Self::handle_range_end`] must + /// be called after every range in the offset. + /// + /// In the event that keys are skipped, the behaviour is undefined and can lead to an + /// incorrect read plan. We can end up asserting, erroring in wal redo or returning + /// incorrect data to the user. + /// + /// The `flag` argument has two interesting values: + /// * [`BlobFlag::Replaces`]: The blob for this key should replace all existing blobs. + /// This is used for WAL records that `will_init`. + /// * [`BlobFlag::Ignore`]: This blob should not be included in the read. This happens + /// if the blob is cached. + pub fn handle(&mut self, key: Key, lsn: Lsn, offset: u64, flag: BlobFlag) { + // Implementation note: internally lag behind by one blob such that + // we have a start and end offset when initialising [`VectoredRead`] + let (prev_key, prev_lsn, prev_offset, prev_flag) = match self.prev { + None => { + self.prev = Some((key, lsn, offset, flag)); + return; + } + Some(prev) => prev, + }; + + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + + self.prev = Some((key, lsn, offset, flag)); + } + + pub fn handle_range_end(&mut self, offset: u64) { + if let Some((prev_key, prev_lsn, prev_offset, prev_flag)) = self.prev { + self.add_blob(prev_key, prev_lsn, prev_offset, offset, prev_flag); + } + + self.prev = None; + } + + fn add_blob(&mut self, key: Key, lsn: Lsn, start_offset: u64, end_offset: u64, flag: BlobFlag) { + match flag { + BlobFlag::None => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Replaces => { + let blobs_for_key = self.blobs.entry(key).or_default(); + blobs_for_key.clear(); + blobs_for_key.push((lsn, start_offset, end_offset)); + } + BlobFlag::Ignore => {} + } + } + + pub fn finish(self) -> Vec { + let mut current_read_builder: Option = None; + let mut reads = Vec::new(); + + for (key, blobs_for_key) in self.blobs { + for (lsn, start_offset, end_offset) in blobs_for_key { + let extended = match &mut current_read_builder { + Some(read_builder) => { + read_builder.extend(start_offset, end_offset, BlobMeta { key, lsn }) + } + None => VectoredReadExtended::No, + }; + + if extended == VectoredReadExtended::No { + let next_read_builder = VectoredReadBuilder::new( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.max_read_size, + ); + + let prev_read_builder = current_read_builder.replace(next_read_builder); + + // `current_read_builder` is None in the first iteration of the outer loop + if let Some(read_builder) = prev_read_builder { + reads.push(read_builder.build()); + } + } + } + } + + if let Some(read_builder) = current_read_builder { + reads.push(read_builder.build()); + } + + reads + } +} + +/// Disk reader for vectored blob spans (does not go through the page cache) +pub struct VectoredBlobReader<'a> { + file: &'a VirtualFile, +} + +impl<'a> VectoredBlobReader<'a> { + pub fn new(file: &'a VirtualFile) -> Self { + Self { file } + } + + /// Read the requested blobs into the buffer. + /// + /// We have to deal with the fact that blobs are not fixed size. + /// Each blob is prefixed by a size header. + /// + /// The success return value is a struct which contains the buffer + /// filled from disk and a list of offsets at which each blob lies + /// in the buffer. + pub async fn read_blobs( + &self, + read: &VectoredRead, + buf: BytesMut, + ) -> Result { + assert!(read.size() > 0); + assert!( + read.size() <= buf.capacity(), + "{} > {}", + read.size(), + buf.capacity() + ); + let buf = self + .file + .read_exact_at_n(buf, read.start, read.size()) + .await?; + + let blobs_at = read.blobs_at.as_slice(); + let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let mut metas = Vec::with_capacity(blobs_at.len()); + + // Blobs in `read` only provide their starting offset. The end offset + // of a blob is implicit: the start of the next blob if one exists + // or the end of the read. + let pairs = blobs_at.iter().zip( + blobs_at + .iter() + .map(Some) + .skip(1) + .chain(std::iter::once(None)), + ); + + for ((offset, meta), next) in pairs { + let offset_in_buf = offset - start_offset; + let first_len_byte = buf[offset_in_buf as usize]; + + // Each blob is prefixed by a header containing it's size. + // Extract the size and skip that header to find the start of the data. + // The size can be 1 or 4 bytes. The most significant bit is 0 in the + // 1 byte case and 1 in the 4 byte case. + let (size_length, blob_size) = if first_len_byte < 0x80 { + (1, first_len_byte as u64) + } else { + let mut blob_size_buf = [0u8; 4]; + let offset_in_buf = offset_in_buf as usize; + + blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); + blob_size_buf[0] &= 0x7f; + (4, u32::from_be_bytes(blob_size_buf) as u64) + }; + + let start = offset_in_buf + size_length; + let end = match next { + Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, + None => start + blob_size, + }; + + assert_eq!(end - start, blob_size); + + metas.push(VectoredBlob { + start: start as usize, + end: end as usize, + meta: *meta, + }) + } + + Ok(VectoredBlobsBuf { buf, blobs: metas }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { + assert_eq!(read.start, offset_range.first().unwrap().2); + + let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); + + let offsets_in_read: Vec<_> = read + .blobs_at + .as_slice() + .iter() + .map(|(offset, _)| *offset) + .collect(); + + assert_eq!(expected_offsets_in_read, offsets_in_read); + } + + #[test] + fn planner_max_read_size_test() { + let max_read_size = 128 * 1024; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (key, lsn, 0, BlobFlag::None), + (key, lsn, 32 * 1024, BlobFlag::None), + (key, lsn, 96 * 1024, BlobFlag::None), // Last in read 1 + (key, lsn, 128 * 1024, BlobFlag::None), // Last in read 2 + (key, lsn, 198 * 1024, BlobFlag::None), // Last in read 3 + (key, lsn, 268 * 1024, BlobFlag::None), // Last in read 4 + (key, lsn, 396 * 1024, BlobFlag::None), // Last in read 5 + (key, lsn, 652 * 1024, BlobFlag::None), // Last in read 6 + ]; + + let ranges = [ + &blob_descriptions[0..3], + &blob_descriptions[3..4], + &blob_descriptions[4..5], + &blob_descriptions[5..6], + &blob_descriptions[6..7], + &blob_descriptions[7..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 6); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + + #[test] + fn planner_replacement_test() { + let max_read_size = 128 * 1024; + let first_key = Key::MIN; + let second_key = first_key.next(); + let lsn = Lsn(0); + + let blob_descriptions = vec![ + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * 1024, BlobFlag::Replaces), + (second_key, lsn, 3 * 1024, BlobFlag::None), + (second_key, lsn, 4 * 1024, BlobFlag::Replaces), // First in read 2 + (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + ]; + + let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions.clone() { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(6 * 1024); + + let reads = planner.finish(); + assert_eq!(reads.len(), 2); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } +} diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 858fc0ef64..b7112108f2 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -548,7 +548,18 @@ impl VirtualFile { B: IoBufMut + Send, { let (buf, res) = - read_exact_at_impl(buf, offset, |buf, offset| self.read_at(buf, offset)).await; + read_exact_at_impl(buf, offset, None, |buf, offset| self.read_at(buf, offset)).await; + res.map(|()| buf) + } + + pub async fn read_exact_at_n(&self, buf: B, offset: u64, count: usize) -> Result + where + B: IoBufMut + Send, + { + let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { + self.read_at(buf, offset) + }) + .await; res.map(|()| buf) } @@ -682,6 +693,7 @@ impl VirtualFile { pub async fn read_exact_at_impl( buf: B, mut offset: u64, + count: Option, mut read_at: F, ) -> (B, std::io::Result<()>) where @@ -689,7 +701,15 @@ where F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = buf.slice_full(); // includes all the uninitialized memory + let mut buf: tokio_epoll_uring::Slice = match count { + Some(count) => { + assert!(count <= buf.bytes_total()); + assert!(count > 0); + buf.slice(..count) // may include uninitialized memory + } + None => buf.slice_full(), // includes all the uninitialized memory + }; + while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -779,7 +799,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -788,13 +808,33 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } + #[tokio::test] + async fn test_with_count() { + let buf = Vec::with_capacity(5); + let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { + expectations: VecDeque::from(vec![Expectation { + offset: 0, + bytes_total: 3, + result: Ok(vec![b'a', b'b', b'c']), + }]), + })); + + let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { + let mock_read_at = Arc::clone(&mock_read_at); + async move { mock_read_at.lock().await.read_at(buf, offset).await } + }) + .await; + assert!(res.is_ok()); + assert_eq!(buf, vec![b'a', b'b', b'c']); + } + #[tokio::test] async fn test_empty_buf_issues_no_syscall() { let buf = Vec::new(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -819,7 +859,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -850,7 +890,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 55c16f73b0..71e77334a1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1115,6 +1115,13 @@ class NeonEnv: # bounce through retries on startup self.attachment_service.start() + def attachment_service_ready(): + assert self.attachment_service.ready() is True + + # Wait for attachment service readiness to prevent unnecessary post start-up + # reconcile. + wait_until(30, 1, attachment_service_ready) + # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py new file mode 100644 index 0000000000..e2e7fffdbe --- /dev/null +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -0,0 +1,195 @@ +import asyncio +import json +from pathlib import Path +from typing import Any, Dict, Tuple + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.utils import get_scale_for_db, humantime_to_ms + +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) + + +@pytest.mark.parametrize("duration", [30]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [10]) +@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"]) +@pytest.mark.timeout(1000) +def test_basebackup_with_high_slru_count( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + get_vectored_impl: str, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + def record(metric, **kwargs): + zenbenchmark.record(metric_name=f"pageserver_basebackup.{metric}", **kwargs) + + params: Dict[str, Tuple[Any, Dict[str, Any]]] = {} + + # params from fixtures + params.update( + { + "n_tenants": (n_tenants, {"unit": ""}), + "pgbench_scale": (pgbench_scale, {"unit": ""}), + "duration": (duration, {"unit": "s"}), + } + ) + + # configure cache sizes like in prod + page_cache_size = 16384 + max_file_descriptors = 500000 + neon_env_builder.pageserver_config_override = ( + f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; " + f"get_vectored_impl='{get_vectored_impl}'; validate_vectored_get=false" + ) + params.update( + { + "pageserver_config_override.page_cache_size": ( + page_cache_size * 8192, + {"unit": "byte"}, + ), + "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}), + } + ) + + for param, (value, kwargs) in params.items(): + record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) + + n_txns = 500000 + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, n_txns) + + env = setup_pageserver_with_tenants( + neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + ) + run_benchmark(env, pg_bin, record, duration) + + +def setup_tenant_template(env: NeonEnv, n_txns: int): + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + + ps_http = env.pageserver.http_client() + + with env.endpoints.create_start( + "main", tenant_id=template_tenant, config_lines=["shared_buffers=1MB"] + ) as ep: + rels = 10 + + asyncio.run(run_updates(ep, n_txns, rels)) + + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + + return (template_tenant, template_timeline, config) + + +# Takes about 5 minutes and produces tenants with around 300 SLRU blocks +# of 8 KiB each. +async def run_updates(ep: Endpoint, n_txns: int, workers_count: int): + workers = [] + for i in range(workers_count): + workers.append(asyncio.create_task(run_update_loop_worker(ep, n_txns, i))) + + await asyncio.gather(*workers) + + +async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): + table = f"t_{idx}" + conn = await ep.connect_async() + await conn.execute(f"CREATE TABLE {table} (pk integer PRIMARY KEY, x integer)") + await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") + await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") + await conn.execute( + """ + CREATE PROCEDURE updating{0}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{1} LOOP + UPDATE {0} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql + """.format(table, n_txns) + ) + await conn.execute("SET statement_timeout=0") + await conn.execute(f"call updating{table}()") + + +def run_benchmark(env: NeonEnv, pg_bin: PgBin, record, duration_secs: int): + ps_http = env.pageserver.http_client() + cmd = [ + str(env.neon_binpath / "pagebench"), + "basebackup", + "--mgmt-api-endpoint", + ps_http.base_url, + "--page-service-connstring", + env.pageserver.connstr(password=None), + "--gzip-probability", + "1", + "--runtime", + f"{duration_secs}s", + # don't specify the targets explicitly, let pagebench auto-discover them + ] + + log.info(f"command: {' '.join(cmd)}") + basepath = pg_bin.run_capture(cmd, with_command_header=False) + results_path = Path(basepath + ".stdout") + log.info(f"Benchmark results at: {results_path}") + + with open(results_path, "r") as f: + results = json.load(f) + log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") + + total = results["total"] + metric = "request_count" + record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + + metric = "latency_mean" + record( + metric, + metric_value=humantime_to_ms(total[metric]), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) + + metric = "latency_percentiles" + for k, v in total[metric].items(): + record( + f"{metric}.{k}", + metric_value=humantime_to_ms(v), + unit="ms", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 307b3848db..8cd3569ea5 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -3,7 +3,6 @@ import os from pathlib import Path from typing import Any, Dict, Tuple -import fixtures.pageserver.many_tenants as many_tenants import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log @@ -15,7 +14,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.utils import get_scale_for_db, humantime_to_ms -from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking +from performance.pageserver.util import ( + setup_pageserver_with_tenants, +) # For reference, the space usage of the snapshots: @@ -80,10 +81,77 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn( for param, (value, kwargs) in params.items(): record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs) - env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale) + + def setup_wrapper(env: NeonEnv): + return setup_tenant_template(env, pg_bin, pgbench_scale) + + env = setup_pageserver_with_tenants( + neon_env_builder, + f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}", + n_tenants, + setup_wrapper, + ) run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) +def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): + """ + Set up a template tenant which will be replicated by the test infra. + It's a pgbench tenant, initialized to a certain scale, and treated afterwards + with a repeat application of (pgbench simple-update workload, checkpoint, compact). + """ + # use a config that makes production of on-disk state timing-insensitive + # as we ingest data into the tenant. + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "0s", # disable periodic compaction + "compaction_threshold": 10, + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 3, + } + template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + env.pageserver.tenant_detach(template_tenant) + env.pageserver.allowed_errors.append( + # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely + ".*Dropped remote consistent LSN updates.*", + ) + env.pageserver.tenant_attach(template_tenant, config) + ps_http = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: + pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + for _ in range( + 0, 17 + ): # some prime number to avoid potential resonances with the "_threshold" variables from the config + # the L0s produced by this appear to have size ~5MiB + num_txns = 10_000 + pg_bin.run_capture( + ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] + ) + wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) + ps_http.timeline_checkpoint(template_tenant, template_timeline) + ps_http.timeline_compact(template_tenant, template_timeline) + # for reference, the output at scale=6 looked like so (306M total) + # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 + # total 306M + # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 + # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 + # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 + # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 + # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 + # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 + # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 + # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 + # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 + # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 + + return (template_tenant, template_timeline, config) + + def run_benchmark_max_throughput_latest_lsn( env: NeonEnv, pg_bin: PgBin, record, duration_secs: int ): @@ -138,78 +206,3 @@ def run_benchmark_max_throughput_latest_lsn( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) - - -def setup_pageserver_with_pgbench_tenants( - neon_env_builder: NeonEnvBuilder, - pg_bin: PgBin, - n_tenants: int, - scale: int, -) -> NeonEnv: - """ - Utility function to set up a pageserver with a given number of identical tenants. - Each tenant is a pgbench tenant, initialize to a certain scale, and treated afterwards - with a repeat application of (pgbench simple-update workload, checkpoint, compact). - """ - - def setup_template(env: NeonEnv): - # use a config that makes production of on-disk state timing-insensitive - # as we ingest data into the tenant. - config = { - "gc_period": "0s", # disable periodic gc - "checkpoint_timeout": "10 years", - "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, - } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) - env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely - ".*Dropped remote consistent LSN updates.*", - ) - env.pageserver.tenant_attach(template_tenant, config) - ps_http = env.pageserver.http_client() - with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: - pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()]) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - for _ in range( - 0, 17 - ): # some prime number to avoid potential resonances with the "_threshold" variables from the config - # the L0s produced by this appear to have size ~5MiB - num_txns = 10_000 - pg_bin.run_capture( - ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()] - ) - wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline) - ps_http.timeline_checkpoint(template_tenant, template_timeline) - ps_http.timeline_compact(template_tenant, template_timeline) - # for reference, the output at scale=6 looked like so (306M total) - # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59 - # total 306M - # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829 - # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919 - # 33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71 - # 36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791 - # 16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1 - # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9 - # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639 - # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799 - # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19 - # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021 - - return (template_tenant, template_timeline, config) - - def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: - return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants) - - env = neon_env_builder.build_and_use_snapshot( - f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit - ) - env.start() - ensure_pageserver_ready_for_benchmarking(env, n_tenants) - return env diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 45eb652362..009d62c9ba 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -2,9 +2,16 @@ Utilities used by all code in this sub-directory """ +from typing import Any, Callable, Dict, Tuple + +import fixtures.pageserver.many_tenants as many_tenants from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, +) from fixtures.pageserver.utils import wait_until_all_tenants_state +from fixtures.types import TenantId, TimelineId def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): @@ -27,3 +34,22 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): assert not layer.remote log.info("ready") + + +def setup_pageserver_with_tenants( + neon_env_builder: NeonEnvBuilder, + name: str, + n_tenants: int, + setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]], +) -> NeonEnv: + """ + Utility function to set up a pageserver with a given number of identical tenants. + """ + + def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + return many_tenants.single_timeline(neon_env_builder, setup, n_tenants) + + env = neon_env_builder.build_and_use_snapshot(name, doit) + env.start() + ensure_pageserver_ready_for_benchmarking(env, n_tenants) + return env