diff --git a/Cargo.lock b/Cargo.lock index 4eea4dd4e8..08b0f21b05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3321,6 +3321,7 @@ dependencies = [ "async-compression", "async-stream", "async-trait", + "bincode", "byteorder", "bytes", "camino", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0dd9fcd0ae..d214b4f209 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -15,6 +15,7 @@ anyhow.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true +bincode.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c726139524..8a4a1d6af0 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -20,6 +20,12 @@ pub enum Value { WalRecord(NeonWalRecord), } +#[derive(Deserialize)] +pub enum ValueDe<'a> { + Image(&'a [u8]), + WalRecord(NeonWalRecord), +} + impl Value { pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a8ec241988..e7d44f20e1 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -66,14 +66,14 @@ where #[derive(Debug)] pub struct ValueReconstructState { pub records: smallvec::SmallVec<[(Lsn, NeonWalRecord); 300]>, - pub img: Option<(Lsn, Bytes)>, + pub img: Option<(Lsn, Range)>, pub(crate) scratch: smallvec::SmallVec<[u8; 2 * PAGE_SZ]>, } impl ValueReconstructState { pub(crate) fn img_ref(&self) -> Option<(Lsn, &[u8])> { match &self.img { - Some((lsn, bytes)) => Some((*lsn, bytes.as_ref())), + Some((lsn, range_in_scratch)) => Some((*lsn, &self.scratch[range_in_scratch.clone()])), None => None, } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 13cc2af632..2688e247ee 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -30,7 +30,7 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::PAGE_SZ; -use crate::repository::{Key, Value, KEY_SIZE}; +use crate::repository::{Key, Value, ValueDe, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection}; @@ -788,19 +788,37 @@ impl DeltaLayerInner { // TODO: this one is super costly, it's allocating a Vec<> for the inner Bytes every time. // That's on avg 200 allocations. // Can we re-use the Vec from a buffer pool? - let val = Value::des(&reconstruct_state.scratch).with_context(|| { + + let val = ValueDe::des(&reconstruct_state.scratch).with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", file.file.path ) })?; match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); + ValueDe::Image(img) => { + let range_in_scratch = { + // TODO, we should just make .img reference .scratch, but that's Pin pain + assert!( + reconstruct_state.scratch.as_ptr_range().start + >= img.as_ptr_range().start + ); + assert!( + reconstruct_state.scratch.as_ptr_range().end <= img.as_ptr_range().end + ); + // SAFETY: TODO; probably technicall some UB in here; we checked same bounds in above assert, + // but other criteria don't hold. Probably fine though. + let start = + unsafe { img.as_ptr().offset_from(reconstruct_state.scratch.as_ptr()) }; + assert!(start >= 0); + let start = usize::try_from(start).unwrap(); + start..(start.checked_add(img.len()).unwrap()) + }; + reconstruct_state.img = Some((entry_lsn, range_in_scratch)); need_image = false; break; } - Value::WalRecord(rec) => { + ValueDe::WalRecord(rec) => { let will_init = rec.will_init(); reconstruct_state.records.push((entry_lsn, rec)); if will_init { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f266ebb60b..ff98575f49 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -433,19 +433,19 @@ impl ImageLayerInner { ) .await? { + reconstruct_state.scratch.clear(); let blob = file .block_cursor() - .read_blob( + .read_blob_into_buf( offset, + &mut reconstruct_state.scratch, &RequestContextBuilder::extend(ctx) .page_content_kind(PageContentKind::ImageLayerValue) .build(), ) .await .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); + reconstruct_state.img = Some((self.lsn, 0..reconstruct_state.scratch.len())); Ok(ValueReconstructResult::Complete) } else { Ok(ValueReconstructResult::Missing) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7c9103eea8..d372ee9e81 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,7 +6,7 @@ //! use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::repository::{Key, Value}; +use crate::repository::{Key, Value, ValueDe}; use crate::tenant::block_io::BlockReader; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState}; @@ -172,14 +172,37 @@ impl InMemoryLayer { if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { - let buf = reader.read_blob(*pos, &ctx).await?; - let value = Value::des(&buf)?; + reconstruct_state.scratch.clear(); + let buf = reader + .read_blob_into_buf(*pos, &mut reconstruct_state.scratch, &ctx) + .await?; + let value = ValueDe::des(&reconstruct_state.scratch)?; match value { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); + ValueDe::Image(img) => { + let range_in_scratch = { + // TODO, we should just make .img reference .scratch, but that's Pin pain + assert!( + reconstruct_state.scratch.as_ptr_range().start + >= img.as_ptr_range().start + ); + assert!( + reconstruct_state.scratch.as_ptr_range().end + <= img.as_ptr_range().end + ); + // SAFETY: TODO; probably technicall some UB in here; we checked same bounds in above assert, + // but other criteria don't hold. Probably fine though. + let start = unsafe { + img.as_ptr().offset_from(reconstruct_state.scratch.as_ptr()) + }; + assert!(start >= 0); + let start = usize::try_from(start).unwrap(); + start..(start.checked_add(img.len()).unwrap()) + }; + + reconstruct_state.img = Some((*entry_lsn, range_in_scratch)); return Ok(ValueReconstructResult::Complete); } - Value::WalRecord(rec) => { + ValueDe::WalRecord(rec) => { let will_init = rec.will_init(); reconstruct_state.records.push((*entry_lsn, rec)); if will_init { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ce71bc6b2e..bf0e4a763c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -11,7 +11,7 @@ mod walreceiver; pub(crate) static REQ_LRU_SIZE: AtomicUsize = AtomicUsize::new(0); use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use enumset::EnumSet; use fail::fail_point; @@ -4333,14 +4333,16 @@ impl Timeline { // If we have a page image, and no WAL, we're all set if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.img { + if let Some((img_lsn, img)) = data.img_ref() { trace!( "found page image for key {} at {}, no WAL redo required, req LSN {}", key, img_lsn, request_lsn, ); - Ok(img.clone()) + let mut bytes = BytesMut::new(); // TODO: avoid allocation + bytes.extend_from_slice(img); + Ok(bytes.into()) } else { Err(PageReconstructError::from(anyhow!( "base image for {key} at {request_lsn} not found"