diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7dc5cdd089..5208fdb7b3 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -90,14 +90,30 @@ pub struct Summary { impl From<&DeltaLayer> for Summary { fn from(layer: &DeltaLayer) -> Self { + Self::expected( + layer.desc.tenant_id, + layer.desc.timeline_id, + layer.desc.key_range.clone(), + layer.desc.lsn_range.clone(), + ) + } +} + +impl Summary { + pub(super) fn expected( + tenant_id: TenantId, + timeline_id: TimelineId, + keys: Range, + lsns: Range, + ) -> Self { Self { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.desc.tenant_id, - timeline_id: layer.desc.timeline_id, - key_range: layer.desc.key_range.clone(), - lsn_range: layer.desc.lsn_range.clone(), + tenant_id, + timeline_id, + key_range: keys, + lsn_range: lsns, index_start_blk: 0, index_root_blk: 0, @@ -210,6 +226,12 @@ pub struct DeltaLayerInner { file: FileBlockReader, } +impl AsRef for DeltaLayerInner { + fn as_ref(&self) -> &DeltaLayerInner { + self + } +} + impl std::fmt::Debug for DeltaLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("DeltaLayerInner") @@ -307,86 +329,16 @@ impl Layer for DeltaLayer { ctx: &RequestContext, ) -> anyhow::Result { ensure!(lsn_range.start >= self.desc.lsn_range.start); - let mut need_image = true; ensure!(self.desc.key_range.contains(&key)); - { - // Open the file and lock the metadata in memory - let inner = self - .load(LayerAccessKind::GetValueReconstructData, ctx) - .await?; + let inner = self + .load(LayerAccessKind::GetValueReconstructData, ctx) + .await?; - // Scan the page versions backwards, starting from `lsn`. - let file = &inner.file; - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, - file, - ); - let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - - let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - - tree_reader - .visit(&search_key.0, VisitDirection::Backwards, |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); - - !blob_ref.will_init() - }) - .await?; - - // Ok, 'offsets' now contains the offsets of all the entries we need to read - let cursor = file.block_cursor(); - let mut buf = Vec::new(); - for (entry_lsn, pos) in offsets { - cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { - format!( - "Failed to read blob from virtual file {}", - file.file.path.display() - ) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - file.file.path.display() - ) - })?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } - } - } - // release metadata lock and close the file - } - - // If an older page image is needed to reconstruct the page, let the - // caller know. - if need_image { - Ok(ValueReconstructResult::Continue) - } else { - Ok(ValueReconstructResult::Complete) - } + inner + .get_value_reconstruct_data(key, lsn_range, reconstruct_state) + .await } /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. @@ -512,43 +464,27 @@ impl DeltaLayer { async fn load_inner(&self) -> Result> { let path = self.path(); - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open file '{}'", path.display()))?; - let file = FileBlockReader::new(file); + let summary = match &self.path_or_conf { + PathOrConf::Conf(_) => Some(Summary::from(self)), + PathOrConf::Path(_) => None, + }; - let summary_blk = file.read_blk(0)?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let loaded = DeltaLayerInner::load(&path, summary)?; - match &self.path_or_conf { - PathOrConf::Conf(_) => { - let mut expected_summary = Summary::from(self); - expected_summary.index_start_blk = actual_summary.index_start_blk; - expected_summary.index_root_blk = actual_summary.index_root_blk; - if actual_summary != expected_summary { - bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary); - } - } - PathOrConf::Path(path) => { - let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); - let expected_filename = self.filename().file_name(); + if let PathOrConf::Path(ref path) = self.path_or_conf { + // not production code - if actual_filename != expected_filename { - println!( - "warning: filename does not match what is expected from in-file summary" - ); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); - } + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); + + if actual_filename != expected_filename { + println!("warning: filename does not match what is expected from in-file summary"); + println!("actual: {:?}", actual_filename); + println!("expected: {:?}", expected_filename); } } - debug!("loaded from {}", &path.display()); - - Ok(Arc::new(DeltaLayerInner { - file, - index_start_blk: actual_summary.index_start_blk, - index_root_blk: actual_summary.index_root_blk, - })) + Ok(Arc::new(loaded)) } /// Create a DeltaLayer struct representing an existing file on disk. @@ -617,9 +553,12 @@ impl DeltaLayer { /// Obtains all keys and value references stored in the layer /// /// The value can be obtained via the [`ValueRef::load`] function. - pub async fn load_val_refs(&self, ctx: &RequestContext) -> Result> { + pub async fn load_val_refs( + &self, + ctx: &RequestContext, + ) -> Result>)>> { let inner = self - .load(LayerAccessKind::KeyIter, ctx) + .load(LayerAccessKind::Iter, ctx) .await .context("load delta layer")?; DeltaLayerInner::load_val_refs(inner) @@ -908,15 +847,120 @@ impl Drop for DeltaLayerWriter { } impl DeltaLayerInner { - async fn load_val_refs(this: &Arc) -> Result> { - let file = &this.file; + pub(super) fn load(path: &std::path::Path, summary: Option) -> anyhow::Result { + let file = VirtualFile::open(path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + let file = FileBlockReader::new(file); + + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + + if let Some(mut expected_summary) = summary { + // production code path + expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; + if actual_summary != expected_summary { + bail!( + "in-file summary does not match expected summary. actual = {:?} expected = {:?}", + actual_summary, + expected_summary + ); + } + } + + Ok(DeltaLayerInner { + file, + index_start_blk: actual_summary.index_start_blk, + index_root_blk: actual_summary.index_root_blk, + }) + } + + pub(super) async fn get_value_reconstruct_data( + &self, + key: Key, + lsn_range: Range, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + let mut need_image = true; + // Scan the page versions backwards, starting from `lsn`. + let file = &self.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - this.index_start_blk, - this.index_root_blk, + self.index_start_blk, + self.index_root_blk, file, ); + let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1)); - let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new(); + let mut offsets: Vec<(Lsn, u64)> = Vec::new(); + + tree_reader + .visit(&search_key.0, VisitDirection::Backwards, |key, value| { + let blob_ref = BlobRef(value); + if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { + return false; + } + let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } + offsets.push((entry_lsn, blob_ref.pos())); + + !blob_ref.will_init() + }) + .await?; + + // Ok, 'offsets' now contains the offsets of all the entries we need to read + let cursor = file.block_cursor(); + let mut buf = Vec::new(); + for (entry_lsn, pos) in offsets { + cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { + format!( + "Failed to read blob from virtual file {}", + file.file.path.display() + ) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + file.file.path.display() + ) + })?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((entry_lsn, img)); + need_image = false; + break; + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } + } + } + + // If an older page image is needed to reconstruct the page, let the + // caller know. + if need_image { + Ok(ValueReconstructResult::Continue) + } else { + Ok(ValueReconstructResult::Complete) + } + } + + pub(super) async fn load_val_refs + Clone>( + this: &T, + ) -> Result)>> { + let dl = this.as_ref(); + let file = &dl.file; + let tree_reader = + DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file); + + let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new(); tree_reader .visit( &[0u8; DELTA_KEY_SIZE], @@ -935,7 +979,8 @@ impl DeltaLayerInner { Ok(all_offsets) } - async fn load_keys(&self) -> Result> { + + pub(super) async fn load_keys(&self) -> Result> { let file = &self.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( self.index_start_blk, @@ -975,26 +1020,27 @@ impl DeltaLayerInner { } /// Reference to an on-disk value -pub struct ValueRef { +pub struct ValueRef> { blob_ref: BlobRef, - reader: BlockCursor, + reader: BlockCursor>, } -impl ValueRef { +impl> ValueRef { /// Loads the value from disk pub fn load(&self) -> Result { + // theoretically we *could* record an access time for each, but it does not really matter let buf = self.reader.read_blob(self.blob_ref.pos())?; let val = Value::des(&buf)?; Ok(val) } } -struct Adapter(Arc); +struct Adapter>(T); -impl BlockReader for Adapter { +impl> BlockReader for Adapter { type BlockLease = PageReadGuard<'static>; fn read_blk(&self, blknum: u32) -> Result { - self.0.file.read_blk(blknum) + self.0.as_ref().file.read_blk(blknum) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 511af71210..2824abba75 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -66,7 +66,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay /// the 'index' starts at the block indicated by 'index_start_blk' /// #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -struct Summary { +pub(super) struct Summary { /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC. magic: u16, format_version: u16, @@ -85,13 +85,29 @@ struct Summary { impl From<&ImageLayer> for Summary { fn from(layer: &ImageLayer) -> Self { + Self::expected( + layer.desc.tenant_id, + layer.desc.timeline_id, + layer.desc.key_range.clone(), + layer.lsn, + ) + } +} + +impl Summary { + pub(super) fn expected( + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn: Lsn, + ) -> Self { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.desc.tenant_id, - timeline_id: layer.desc.timeline_id, - key_range: layer.desc.key_range.clone(), - lsn: layer.lsn, + tenant_id, + timeline_id, + key_range, + lsn, index_start_blk: 0, index_root_blk: 0, @@ -136,6 +152,8 @@ pub struct ImageLayerInner { index_start_blk: u32, index_root_blk: u32, + lsn: Lsn, + /// Reader object for reading blocks from the file. file: FileBlockReader, } @@ -200,27 +218,11 @@ impl Layer for ImageLayer { let inner = self .load(LayerAccessKind::GetValueReconstructData, ctx) .await?; - - let file = &inner.file; - let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); - - let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; - key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader.get(&keybuf).await? { - let blob = file.block_cursor().read_blob(offset).with_context(|| { - format!( - "failed to read value from data file {} at offset {}", - self.path().display(), - offset - ) - })?; - let value = Bytes::from(blob); - - reconstruct_state.img = Some((self.lsn, value)); - Ok(ValueReconstructResult::Complete) - } else { - Ok(ValueReconstructResult::Missing) - } + inner + .get_value_reconstruct_data(key, reconstruct_state) + .await + // FIXME: makes no sense to dump paths + .with_context(|| format!("read {}", self.path().display())) } /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. @@ -332,42 +334,26 @@ impl ImageLayer { async fn load_inner(&self) -> Result { let path = self.path(); - // Open the file if it's not open already. - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open file '{}'", path.display()))?; - let file = FileBlockReader::new(file); - let summary_blk = file.read_blk(0)?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let expected_summary = match &self.path_or_conf { + PathOrConf::Conf(_) => Some(Summary::from(self)), + PathOrConf::Path(_) => None, + }; - match &self.path_or_conf { - PathOrConf::Conf(_) => { - let mut expected_summary = Summary::from(self); - expected_summary.index_start_blk = actual_summary.index_start_blk; - expected_summary.index_root_blk = actual_summary.index_root_blk; + let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?; - if actual_summary != expected_summary { - bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary); - } - } - PathOrConf::Path(path) => { - let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); - let expected_filename = self.filename().file_name(); + if let PathOrConf::Path(ref path) = self.path_or_conf { + // not production code + let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned(); + let expected_filename = self.filename().file_name(); - if actual_filename != expected_filename { - println!( - "warning: filename does not match what is expected from in-file summary" - ); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); - } + if actual_filename != expected_filename { + println!("warning: filename does not match what is expected from in-file summary"); + println!("actual: {:?}", actual_filename); + println!("expected: {:?}", expected_filename); } } - Ok(ImageLayerInner { - index_start_blk: actual_summary.index_start_blk, - index_root_blk: actual_summary.index_root_blk, - file, - }) + Ok(loaded) } /// Create an ImageLayer struct representing an existing file on disk @@ -437,6 +423,65 @@ impl ImageLayer { } } +impl ImageLayerInner { + pub(super) fn load( + path: &std::path::Path, + lsn: Lsn, + summary: Option, + ) -> anyhow::Result { + let file = VirtualFile::open(path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + let file = FileBlockReader::new(file); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + + if let Some(mut expected_summary) = summary { + // production code path + expected_summary.index_start_blk = actual_summary.index_start_blk; + expected_summary.index_root_blk = actual_summary.index_root_blk; + + if actual_summary != expected_summary { + bail!( + "in-file summary does not match expected summary. actual = {:?} expected = {:?}", + actual_summary, + expected_summary + ); + } + } + + Ok(ImageLayerInner { + index_start_blk: actual_summary.index_start_blk, + index_root_blk: actual_summary.index_root_blk, + lsn, + file, + }) + } + + pub(super) async fn get_value_reconstruct_data( + &self, + key: Key, + reconstruct_state: &mut ValueReconstructState, + ) -> anyhow::Result { + let file = &self.file; + let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file); + + let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; + key.write_to_byte_slice(&mut keybuf); + if let Some(offset) = tree_reader.get(&keybuf).await? { + let blob = file + .block_cursor() + .read_blob(offset) + .with_context(|| format!("failed to read value from offset {}", offset))?; + let value = Bytes::from(blob); + + reconstruct_state.img = Some((self.lsn, value)); + Ok(ValueReconstructResult::Complete) + } else { + Ok(ValueReconstructResult::Missing) + } + } +} + /// A builder object for constructing a new image layer. /// /// Usage: