From 1daeba6d8711b033f12a461ca3cda1ecff921783 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 30 Jan 2024 14:10:20 +0000 Subject: [PATCH] another attempt to reduce allocations, don't know if helpers, certainly didn't eliminate all of them --- libs/utils/src/bin_ser.rs | 8 ++ pageserver/src/repository.rs | 4 +- pageserver/src/tenant/blob_io.rs | 14 ++++ pageserver/src/tenant/storage_layer.rs | 8 +- .../src/tenant/storage_layer/delta_layer.rs | 74 ++++++++++++++----- .../src/tenant/storage_layer/image_layer.rs | 5 +- .../tenant/storage_layer/inmemory_layer.rs | 31 ++------ .../tenant/timeline/reconstruct_state_pool.rs | 4 +- 8 files changed, 94 insertions(+), 54 deletions(-) diff --git a/libs/utils/src/bin_ser.rs b/libs/utils/src/bin_ser.rs index 42b45eeea0..3600191949 100644 --- a/libs/utils/src/bin_ser.rs +++ b/libs/utils/src/bin_ser.rs @@ -161,6 +161,14 @@ pub trait BeSer { be_coder().deserialize_from(r).map_err(|e| e.into()) } + /// Deserialize from a reader + fn des_from_custom<'s, R: bincode::BincodeRead<'s>>(r: R) -> Result + where + Self: DeserializeOwned, + { + be_coder().deserialize_from_custom(r).map_err(|e| e.into()) + } + /// Compute the serialized size of a data structure /// /// Note: it may be faster to serialize to a buffer and then measure the diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 8a4a1d6af0..aee2e03682 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -21,8 +21,8 @@ pub enum Value { } #[derive(Deserialize)] -pub enum ValueDe<'a> { - Image(&'a [u8]), +pub enum ValueDe { + Image(Vec), WalRecord(NeonWalRecord), } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 1344d1b29a..7c96157692 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -52,6 +52,20 @@ impl VecIsh for smallvec::SmallVec<[u8; N]> { } } +impl VecIsh for bytes::BytesMut { + fn clear(&mut self) { + bytes::BytesMut::clear(self) + } + + fn reserve(&mut self, len: usize) { + bytes::BytesMut::reserve(self, len) + } + + fn extend_from_slice(&mut self, o: &[u8]) { + bytes::BytesMut::extend_from_slice(self, o) + } +} + impl<'a> BlockCursor<'a> { /// Read a blob into a new buffer. pub async fn read_blob( diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index e7d44f20e1..984b591e97 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -10,7 +10,7 @@ mod layer_desc; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; use enum_map::EnumMap; use enumset::EnumSet; use once_cell::sync::Lazy; @@ -66,14 +66,14 @@ where #[derive(Debug)] pub struct ValueReconstructState { pub records: smallvec::SmallVec<[(Lsn, NeonWalRecord); 300]>, - pub img: Option<(Lsn, Range)>, - pub(crate) scratch: smallvec::SmallVec<[u8; 2 * PAGE_SZ]>, + pub img: Option<(Lsn, Vec)>, + pub(crate) scratch: BytesMut, } impl ValueReconstructState { pub(crate) fn img_ref(&self) -> Option<(Lsn, &[u8])> { match &self.img { - Some((lsn, range_in_scratch)) => Some((*lsn, &self.scratch[range_in_scratch.clone()])), + Some((lsn, img)) => Some((*lsn, &img)), None => None, } } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2688e247ee..ccf71b8f36 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -40,6 +40,7 @@ use crate::virtual_file::{self, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; +use bytes::{Buf, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::models::LayerAccessKind; use pageserver_api::shard::TenantShardId; @@ -747,7 +748,7 @@ impl DeltaLayerInner { ); let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - let mut offsets = smallvec::SmallVec::<[(Lsn, u64); 4]>::new(); + let mut offsets = smallvec::SmallVec::<[(Lsn, u64); 10]>::new(); tree_reader .visit( @@ -789,7 +790,57 @@ impl DeltaLayerInner { // That's on avg 200 allocations. // Can we re-use the Vec from a buffer pool? - let val = ValueDe::des(&reconstruct_state.scratch).with_context(|| { + struct MoveVecBincodeRead { + buf: bytes::BytesMut, + } + + impl std::io::Read for MoveVecBincodeRead { + fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { + let prefix = self.buf.split_to(buf.len()); // FIXME: handle buf < len + buf.copy_from_slice(&prefix); + Ok(buf.len()) + } + } + + impl<'storage> bincode::BincodeRead<'storage> for MoveVecBincodeRead { + #[inline(always)] + fn forward_read_str( + &mut self, + length: usize, + visitor: V, + ) -> bincode::Result + where + V: serde::de::Visitor<'storage>, + { + todo!() + } + + fn get_byte_buffer(&mut self, length: usize) -> bincode::Result> { + let data = if self.buf.len() == length { + // ensure `data` is uniquely owned so Vec::from below is fast + std::mem::take(&mut self.buf) + } else { + self.buf.split_to(length) + }; + Ok(Vec::::from(data)) + } + + fn forward_read_bytes( + &mut self, + length: usize, + visitor: V, + ) -> bincode::Result + where + V: serde::de::Visitor<'storage>, + { + todo!() + } + } + + let val = ValueDe::des_from_custom(MoveVecBincodeRead { + buf: std::mem::take(&mut reconstruct_state.scratch), + }) + .with_context(|| { format!( "Failed to deserialize file blob from virtual file {}", file.file.path @@ -797,24 +848,7 @@ impl DeltaLayerInner { })?; match val { ValueDe::Image(img) => { - let range_in_scratch = { - // TODO, we should just make .img reference .scratch, but that's Pin pain - assert!( - reconstruct_state.scratch.as_ptr_range().start - >= img.as_ptr_range().start - ); - assert!( - reconstruct_state.scratch.as_ptr_range().end <= img.as_ptr_range().end - ); - // SAFETY: TODO; probably technicall some UB in here; we checked same bounds in above assert, - // but other criteria don't hold. Probably fine though. - let start = - unsafe { img.as_ptr().offset_from(reconstruct_state.scratch.as_ptr()) }; - assert!(start >= 0); - let start = usize::try_from(start).unwrap(); - start..(start.checked_add(img.len()).unwrap()) - }; - reconstruct_state.img = Some((entry_lsn, range_in_scratch)); + reconstruct_state.img = Some((entry_lsn, img)); need_image = false; break; } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index ff98575f49..6bdfa03310 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -445,7 +445,10 @@ impl ImageLayerInner { ) .await .with_context(|| format!("failed to read value from offset {}", offset))?; - reconstruct_state.img = Some((self.lsn, 0..reconstruct_state.scratch.len())); + reconstruct_state.img = Some(( + self.lsn, + Vec::::from(std::mem::take(&mut reconstruct_state.scratch)), + )); Ok(ValueReconstructResult::Complete) } else { Ok(ValueReconstructResult::Missing) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index d372ee9e81..d6c0662fad 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -173,36 +173,17 @@ impl InMemoryLayer { let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { reconstruct_state.scratch.clear(); - let buf = reader + reader .read_blob_into_buf(*pos, &mut reconstruct_state.scratch, &ctx) .await?; - let value = ValueDe::des(&reconstruct_state.scratch)?; + // TODO: avoid creating the bincode::SliceReader copy + let value = Value::des(&reconstruct_state.scratch)?; match value { - ValueDe::Image(img) => { - let range_in_scratch = { - // TODO, we should just make .img reference .scratch, but that's Pin pain - assert!( - reconstruct_state.scratch.as_ptr_range().start - >= img.as_ptr_range().start - ); - assert!( - reconstruct_state.scratch.as_ptr_range().end - <= img.as_ptr_range().end - ); - // SAFETY: TODO; probably technicall some UB in here; we checked same bounds in above assert, - // but other criteria don't hold. Probably fine though. - let start = unsafe { - img.as_ptr().offset_from(reconstruct_state.scratch.as_ptr()) - }; - assert!(start >= 0); - let start = usize::try_from(start).unwrap(); - start..(start.checked_add(img.len()).unwrap()) - }; - - reconstruct_state.img = Some((*entry_lsn, range_in_scratch)); + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img.to_vec())); return Ok(ValueReconstructResult::Complete); } - ValueDe::WalRecord(rec) => { + Value::WalRecord(rec) => { let will_init = rec.will_init(); reconstruct_state.records.push((*entry_lsn, rec)); if will_init { diff --git a/pageserver/src/tenant/timeline/reconstruct_state_pool.rs b/pageserver/src/tenant/timeline/reconstruct_state_pool.rs index 38893e78ef..4a0bac4f82 100644 --- a/pageserver/src/tenant/timeline/reconstruct_state_pool.rs +++ b/pageserver/src/tenant/timeline/reconstruct_state_pool.rs @@ -1,6 +1,6 @@ use std::cell::RefCell; -use crate::tenant::storage_layer::ValueReconstructState; +use crate::{page_cache::PAGE_SZ, tenant::storage_layer::ValueReconstructState}; struct Content(ValueReconstructState); @@ -9,7 +9,7 @@ impl Content { Content(ValueReconstructState { records: smallvec::SmallVec::new(), img: None, - scratch: smallvec::SmallVec::new(), + scratch: bytes::BytesMut::with_capacity(2 * PAGE_SZ), }) } fn reset(&mut self) {