From da62407fcece49bbfa0af1df8afb470fd9965b9c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 22 Dec 2021 16:55:37 +0200 Subject: [PATCH] Change the meaning of 'blknum' argument in Layer trait Previously, the 'blknum' argument of various Layer functions was the block number within the overall relation. That was pretty confusing, because an individual layer only holds data from one segment of the relation. Furthermore, the 'put_truncation' function already dealt with per-segment size, not overall relation size, adding to the confusion. Change the meaning of the 'blknum' argument to mean the block number within the segment, not the overall relation. --- pageserver/src/layered_repository.rs | 108 +++++++++++------- .../src/layered_repository/delta_layer.rs | 61 +++++----- .../src/layered_repository/image_layer.rs | 35 +++--- .../src/layered_repository/inmemory_layer.rs | 77 +++++++------ .../src/layered_repository/storage_layer.rs | 41 ++++--- pageserver/src/repository.rs | 33 ++++-- 6 files changed, 201 insertions(+), 154 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b0856bc835..aa225751bc 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -36,8 +36,8 @@ use crate::page_cache; use crate::relish::*; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ - GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, - WALRecord, + BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, + TimelineWriter, WALRecord, }; use crate::tenant_mgr; use crate::walreceiver; @@ -76,7 +76,7 @@ use image_layer::ImageLayer; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; use storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, SegmentBlk, 
SegmentTag, RELISH_SEG_SIZE, }; // re-export this function so that page_cache.rs can use it. @@ -804,11 +804,11 @@ impl Timeline for LayeredTimeline { } /// Look up given page version. - fn get_page_at_lsn(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { - if !rel.is_blocky() && blknum != 0 { + fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } @@ -821,18 +821,18 @@ impl Timeline for LayeredTimeline { lsn, latest_gc_cutoff_lsn ); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { RECONSTRUCT_TIME - .observe_closure_duration(|| self.materialize_page(seg, blknum, lsn, &*layer)) + .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes // the page. See https://github.com/zenithdb/zenith/issues/841 // // Would be nice to detect that situation better. if seg.segno > 0 && self.get_rel_exists(rel, lsn)? 
{ - warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn); + warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); return Ok(ZERO_PAGE.clone()); } @@ -840,7 +840,7 @@ impl Timeline for LayeredTimeline { } } - fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { + fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { if !rel.is_blocky() { bail!( "invalid get_relish_size request for non-blocky relish {}", @@ -1774,7 +1774,8 @@ impl LayeredTimeline { ); match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { Some(size) => { - let last_live_seg = SegmentTag::from_blknum(seg.rel, size - 1); + let (last_live_seg, _rel_blknum) = + SegmentTag::from_blknum(seg.rel, size - 1); info!( "blocky rel size is {} last_live_seg.segno {} seg.segno {}", size, last_live_seg.segno, seg.segno @@ -1851,14 +1852,19 @@ impl LayeredTimeline { Ok(result) } - fn lookup_cached_page(&self, seg: &SegmentTag, blknum: u32, lsn: Lsn) -> Option<(Lsn, Bytes)> { + fn lookup_cached_page( + &self, + rel: &RelishTag, + rel_blknum: BlockNumber, + lsn: Lsn, + ) -> Option<(Lsn, Bytes)> { let cache = page_cache::get(); - if let RelishTag::Relation(rel_tag) = &seg.rel { + if let RelishTag::Relation(rel_tag) = &rel { let (lsn, read_guard) = cache.lookup_materialized_page( self.tenantid, self.timelineid, *rel_tag, - blknum, + rel_blknum, lsn, )?; let img = Bytes::from(read_guard.to_vec()); @@ -1874,7 +1880,7 @@ impl LayeredTimeline { fn materialize_page( &self, seg: SegmentTag, - blknum: u32, + seg_blknum: SegmentBlk, lsn: Lsn, layer: &dyn Layer, ) -> Result { @@ -1882,7 +1888,10 @@ impl LayeredTimeline { // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed // for redo. 
- let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&seg, blknum, lsn) { + let rel = seg.rel; + let rel_blknum = seg.segno * RELISH_SEG_SIZE + seg_blknum; + let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&rel, rel_blknum, lsn) + { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check @@ -1909,13 +1918,13 @@ impl LayeredTimeline { let mut curr_lsn = lsn; loop { let result = layer_ref - .get_page_reconstruct_data(blknum, curr_lsn, cached_lsn_opt, &mut data) + .get_page_reconstruct_data(seg_blknum, curr_lsn, cached_lsn_opt, &mut data) .with_context(|| { format!( "Failed to get reconstruct data {} {:?} {} {} {:?}", layer_ref.get_seg_tag(), layer_ref.filename(), - blknum, + seg_blknum, curr_lsn, cached_lsn_opt, ) @@ -1953,13 +1962,13 @@ impl LayeredTimeline { // but never writes the page. // // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn); + warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); return Ok(ZERO_PAGE.clone()); } bail!( "No base image found for page {} blk {} at {}/{}", - seg.rel, - blknum, + rel, + rel_blknum, self.timelineid, lsn, ); @@ -1977,7 +1986,7 @@ impl LayeredTimeline { } } - self.reconstruct_page(seg.rel, blknum, lsn, data) + self.reconstruct_page(rel, rel_blknum, lsn, data) } /// @@ -1986,7 +1995,7 @@ impl LayeredTimeline { fn reconstruct_page( &self, rel: RelishTag, - blknum: u32, + rel_blknum: BlockNumber, request_lsn: Lsn, mut data: PageReconstructData, ) -> Result { @@ -1998,14 +2007,17 @@ impl LayeredTimeline { if let Some(img) = &data.page_img { trace!( "found page image for blk {} in {} at {}, no WAL redo required", - blknum, + rel_blknum, rel, request_lsn ); Ok(img.clone()) } else { // FIXME: this ought to be an error? 
- warn!("Page {} blk {} at {} not found", rel, blknum, request_lsn); + warn!( + "Page {} blk {} at {} not found", + rel, rel_blknum, request_lsn + ); Ok(ZERO_PAGE.clone()) } } else { @@ -2018,23 +2030,23 @@ impl LayeredTimeline { warn!( "Base image for page {}/{} at {} not found, but got {} WAL records", rel, - blknum, + rel_blknum, request_lsn, data.records.len() ); Ok(ZERO_PAGE.clone()) } else { if data.page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); + trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); + trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); } let last_rec_lsn = data.records.last().unwrap().0; let img = self.walredo_mgr.request_redo( rel, - blknum, + rel_blknum, request_lsn, data.page_img.clone(), data.records, @@ -2046,7 +2058,7 @@ impl LayeredTimeline { self.tenantid, self.timelineid, *rel_tag, - blknum, + rel_blknum, last_rec_lsn, &img, ); @@ -2106,45 +2118,57 @@ impl Deref for LayeredTimelineWriter<'_> { } impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { - fn put_wal_record(&self, lsn: Lsn, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { + fn put_wal_record( + &self, + lsn: Lsn, + rel: RelishTag, + rel_blknum: BlockNumber, + rec: WALRecord, + ) -> Result<()> { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } ensure!(lsn.is_aligned(), "unaligned record LSN"); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = 
SegmentTag::from_blknum(rel, rel_blknum); let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_wal_record(lsn, blknum, rec)?; + let delta_size = layer.put_wal_record(lsn, seg_blknum, rec)?; self.tl .increase_current_logical_size(delta_size * BLCKSZ as u32); Ok(()) } - fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { + fn put_page_image( + &self, + rel: RelishTag, + rel_blknum: BlockNumber, + lsn: Lsn, + img: Bytes, + ) -> Result<()> { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } ensure!(lsn.is_aligned(), "unaligned record LSN"); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_page_image(blknum, lsn, img)?; + let delta_size = layer.put_page_image(seg_blknum, lsn, img)?; self.tl .increase_current_logical_size(delta_size * BLCKSZ as u32); Ok(()) } - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> Result<()> { + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: BlockNumber) -> Result<()> { if !rel.is_blocky() { bail!("invalid truncation for non-blocky relish {}", rel); } @@ -2232,7 +2256,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { } } else { // TODO handle TwoPhase relishes - let seg = SegmentTag::from_blknum(rel, 0); + let (seg, _seg_blknum) = SegmentTag::from_blknum(rel, 0); let layer = self.tl.get_layer_for_write(seg, lsn)?; layer.drop_segment(lsn); } diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 11eec36824..b345e55347 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -11,7 +11,7 @@ //! 
can happen when you create a new branch in the middle of a delta layer, and the WAL //! records on the new branch are put in a new delta layer. //! -//! When a delta file needs to be accessed, we slurp the metadata and relsize chapters +//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters //! into memory, into the DeltaLayerInner struct. See load() and unload() functions. //! To access a page/WAL record, we search `page_version_metas` for the block # and LSN. //! The byte ranges in the metadata can be used to find the page/WAL record in @@ -35,13 +35,14 @@ //! file contents in any way. //! //! A detlta file is constructed using the 'bookfile' crate. Each file consists of two -//! parts: the page versions and the relation sizes. They are stored as separate chapters. +//! parts: the page versions and the segment sizes. They are stored as separate chapters. //! use crate::layered_repository::blob::BlobWriter; use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::page_versions::PageVersions; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, + RELISH_SEG_SIZE, }; use crate::virtual_file::VirtualFile; use crate::walrecord; @@ -76,7 +77,7 @@ static PAGE_VERSION_METAS_CHAPTER: u64 = 1; /// Page/WAL bytes - cannot be interpreted /// without PAGE_VERSION_METAS_CHAPTER static PAGE_VERSIONS_CHAPTER: u64 = 2; -static REL_SIZES_CHAPTER: u64 = 3; +static SEG_SIZES_CHAPTER: u64 = 3; /// Contains the [`Summary`] struct static SUMMARY_CHAPTER: u64 = 4; @@ -136,7 +137,7 @@ pub struct DeltaLayer { } pub struct DeltaLayerInner { - /// If false, the 'page_version_metas' and 'relsizes' have not been + /// If false, the 'page_version_metas' and 'seg_sizes' have not been /// loaded into memory yet. 
loaded: bool, @@ -144,16 +145,16 @@ pub struct DeltaLayerInner { /// All versions of all pages in the file are are kept here. /// Indexed by block number and LSN. - page_version_metas: VecMap<(u32, Lsn), BlobRange>, + page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, - /// `relsizes` tracks the size of the relation at different points in time. - relsizes: VecMap, + /// `seg_sizes` tracks the size of the relation at different points in time. + seg_sizes: VecMap, } impl DeltaLayerInner { - fn get_seg_size(&self, lsn: Lsn) -> Result { + fn get_seg_size(&self, lsn: Lsn) -> Result { let slice = self - .relsizes + .seg_sizes .slice_range((Included(&Lsn(0)), Included(&lsn))); if let Some((_entry_lsn, entry)) = slice.last() { Ok(*entry) @@ -195,14 +196,14 @@ impl Layer for DeltaLayer { /// Look up given page in the cache. fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { let mut need_image = true; - assert!(self.seg.blknum_in_seg(blknum)); + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); match &cached_img_lsn { Some(cached_lsn) if &self.end_lsn <= cached_lsn => { @@ -261,7 +262,7 @@ impl Layer for DeltaLayer { if need_image && reconstruct_data.records.is_empty() && self.seg.rel.is_blocky() - && blknum - self.seg.segno * RELISH_SEG_SIZE >= inner.get_seg_size(lsn)? + && blknum >= inner.get_seg_size(lsn)? 
{ return Ok(PageReconstructResult::Missing(self.start_lsn)); } @@ -279,7 +280,7 @@ impl Layer for DeltaLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { + fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), @@ -309,7 +310,7 @@ impl Layer for DeltaLayer { fn unload(&self) -> Result<()> { let mut inner = self.inner.lock().unwrap(); inner.page_version_metas = VecMap::default(); - inner.relsizes = VecMap::default(); + inner.seg_sizes = VecMap::default(); inner.loaded = false; // Note: we keep the Book open. Is that a good idea? The virtual file @@ -340,9 +341,9 @@ impl Layer for DeltaLayer { self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn ); - println!("--- relsizes ---"); + println!("--- seg sizes ---"); let inner = self.load()?; - for (k, v) in inner.relsizes.as_slice() { + for (k, v) in inner.seg_sizes.as_slice() { println!(" {}: {}", k, v); } println!("--- page versions ---"); @@ -396,12 +397,12 @@ impl DeltaLayer { } } - /// Create a new delta file, using the given page versions and relsizes. + /// Create a new delta file, using the given page versions and seg_sizes. /// The page versions are passed in a PageVersions struct. If 'cutoff' is /// given, only page versions with LSN < cutoff are included. /// /// This is used to write the in-memory layer to disk. The page_versions and - /// relsizes are thus passed in the same format as they are in the in-memory + /// seg_sizes are thus passed in the same format as they are in the in-memory /// layer, as that's expedient. 
#[allow(clippy::too_many_arguments)] pub fn create( @@ -414,10 +415,10 @@ impl DeltaLayer { dropped: bool, page_versions: &PageVersions, cutoff: Option, - relsizes: VecMap, + seg_sizes: VecMap, ) -> Result { if seg.rel.is_blocky() { - assert!(!relsizes.is_empty()); + assert!(!seg_sizes.is_empty()); } let delta_layer = DeltaLayer { @@ -432,7 +433,7 @@ impl DeltaLayer { loaded: false, book: None, page_version_metas: VecMap::default(), - relsizes, + seg_sizes, }), }; let mut inner = delta_layer.inner.lock().unwrap(); @@ -471,9 +472,9 @@ impl DeltaLayer { chapter.write_all(&buf)?; let book = chapter.close()?; - // and relsizes to separate chapter - let mut chapter = book.new_chapter(REL_SIZES_CHAPTER); - let buf = VecMap::ser(&inner.relsizes)?; + // and seg_sizes to separate chapter + let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER); + let buf = VecMap::ser(&inner.seg_sizes)?; chapter.write_all(&buf)?; let book = chapter.close()?; @@ -550,13 +551,13 @@ impl DeltaLayer { let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; let page_version_metas = VecMap::des(&chapter)?; - let chapter = book.read_chapter(REL_SIZES_CHAPTER)?; - let relsizes = VecMap::des(&chapter)?; + let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?; + let seg_sizes = VecMap::des(&chapter)?; debug!("loaded from {}", &path.display()); inner.page_version_metas = page_version_metas; - inner.relsizes = relsizes; + inner.seg_sizes = seg_sizes; inner.loaded = true; Ok(inner) @@ -581,7 +582,7 @@ impl DeltaLayer { loaded: false, book: None, page_version_metas: VecMap::default(), - relsizes: VecMap::default(), + seg_sizes: VecMap::default(), }), } } @@ -608,7 +609,7 @@ impl DeltaLayer { loaded: false, book: None, page_version_metas: VecMap::default(), - relsizes: VecMap::default(), + seg_sizes: VecMap::default(), }), }) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 22cf3347d8..3154934287 100644 --- 
a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -23,7 +23,7 @@ //! use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentTag, + Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, }; use crate::layered_repository::LayeredTimeline; use crate::layered_repository::RELISH_SEG_SIZE; @@ -99,7 +99,7 @@ pub struct ImageLayer { #[derive(Clone)] enum ImageType { - Blocky { num_blocks: u32 }, + Blocky { num_blocks: SegmentBlk }, NonBlocky, } @@ -144,11 +144,12 @@ impl Layer for ImageLayer { /// Look up given page in the file fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); assert!(lsn >= self.lsn); match cached_img_lsn { @@ -158,17 +159,15 @@ impl Layer for ImageLayer { let inner = self.load()?; - let base_blknum = blknum % RELISH_SEG_SIZE; - let buf = match &inner.image_type { ImageType::Blocky { num_blocks } => { // Check if the request is beyond EOF - if base_blknum >= *num_blocks { + if blknum >= *num_blocks { return Ok(PageReconstructResult::Missing(lsn)); } let mut buf = vec![0u8; BLOCK_SIZE]; - let offset = BLOCK_SIZE as u64 * base_blknum as u64; + let offset = BLOCK_SIZE as u64 * blknum as u64; let chapter = inner .book @@ -180,7 +179,7 @@ impl Layer for ImageLayer { buf } ImageType::NonBlocky => { - ensure!(base_blknum == 0); + ensure!(blknum == 0); inner .book .as_ref() @@ -195,7 +194,7 @@ impl Layer for ImageLayer { } /// Get size of the segment - fn get_seg_size(&self, _lsn: Lsn) -> Result { + fn get_seg_size(&self, _lsn: Lsn) -> Result { let inner = self.load()?; match inner.image_type { ImageType::Blocky { num_blocks } => Ok(num_blocks), @@ -276,7 +275,7 @@ impl ImageLayer { base_images: Vec, 
) -> Result { let image_type = if seg.rel.is_blocky() { - let num_blocks: u32 = base_images.len().try_into()?; + let num_blocks: SegmentBlk = base_images.len().try_into()?; ImageType::Blocky { num_blocks } } else { assert_eq!(base_images.len(), 1); @@ -358,15 +357,11 @@ impl ImageLayer { let seg = src.get_seg_tag(); let timelineid = timeline.timelineid; - let startblk; - let size; - if seg.rel.is_blocky() { - size = src.get_seg_size(lsn)?; - startblk = seg.segno * RELISH_SEG_SIZE; + let size = if seg.rel.is_blocky() { + src.get_seg_size(lsn)? } else { - size = 1; - startblk = 0; - } + 1 + }; trace!( "creating new ImageLayer for {} on timeline {} at {}", @@ -376,7 +371,7 @@ impl ImageLayer { ); let mut base_images: Vec = Vec::new(); - for blknum in startblk..(startblk + size) { + for blknum in 0..size { let img = timeline.materialize_page(seg, blknum, lsn, &*src)?; base_images.push(img); @@ -435,7 +430,7 @@ impl ImageLayer { let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; let images_len = chapter.len(); ensure!(images_len % BLOCK_SIZE as u64 == 0); - let num_blocks: u32 = (images_len / BLOCK_SIZE as u64).try_into()?; + let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?; ImageType::Blocky { num_blocks } } else { let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index dd02c8f951..4ae6e80824 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -7,7 +7,8 @@ use crate::layered_repository::ephemeral_file::EphemeralFile; use crate::layered_repository::filename::DeltaFileName; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, + RELISH_SEG_SIZE, }; use 
crate::layered_repository::LayeredTimeline; use crate::layered_repository::ZERO_PAGE; @@ -64,13 +65,13 @@ pub struct InMemoryLayerInner { page_versions: PageVersions, /// - /// `segsizes` tracks the size of the segment at different points in time. + /// `seg_sizes` tracks the size of the segment at different points in time. /// /// For a blocky rel, there is always one entry, at the layer's start_lsn, /// so that determining the size never depends on the predecessor layer. For - /// a non-blocky rel, 'segsizes' is not used and is always empty. + /// a non-blocky rel, 'seg_sizes' is not used and is always empty. /// - segsizes: VecMap, + seg_sizes: VecMap, } impl InMemoryLayerInner { @@ -78,9 +79,9 @@ impl InMemoryLayerInner { assert!(self.end_lsn.is_none()); } - fn get_seg_size(&self, lsn: Lsn) -> u32 { + fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { // Scan the BTreeMap backwards, starting from the given entry. - let slice = self.segsizes.slice_range(..=lsn); + let slice = self.seg_sizes.slice_range(..=lsn); // We make sure there is always at least one entry if let Some((_entry_lsn, entry)) = slice.last() { @@ -150,14 +151,14 @@ impl Layer for InMemoryLayer { /// Look up given page in the cache. fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { let mut need_image = true; - assert!(self.seg.blknum_in_seg(blknum)); + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); { let inner = self.inner.read().unwrap(); @@ -198,7 +199,7 @@ impl Layer for InMemoryLayer { if need_image && reconstruct_data.records.is_empty() && self.seg.rel.is_blocky() - && blknum - self.seg.segno * RELISH_SEG_SIZE >= self.get_seg_size(lsn)? + && blknum >= self.get_seg_size(lsn)? 
{ return Ok(PageReconstructResult::Missing(self.start_lsn)); } @@ -220,7 +221,7 @@ impl Layer for InMemoryLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { + fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), @@ -291,8 +292,8 @@ impl Layer for InMemoryLayer { self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, ); - for (k, v) in inner.segsizes.as_slice() { - println!("segsizes {}: {}", k, v); + for (k, v) in inner.seg_sizes.as_slice() { + println!("seg_sizes {}: {}", k, v); } for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) { @@ -339,10 +340,10 @@ impl InMemoryLayer { start_lsn ); - // The segment is initially empty, so initialize 'segsizes' with 0. - let mut segsizes = VecMap::default(); + // The segment is initially empty, so initialize 'seg_sizes' with 0. + let mut seg_sizes = VecMap::default(); if seg.rel.is_blocky() { - segsizes.append(start_lsn, 0).unwrap(); + seg_sizes.append(start_lsn, 0).unwrap(); } let file = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -359,7 +360,7 @@ impl InMemoryLayer { end_lsn: None, dropped: false, page_versions: PageVersions::new(file), - segsizes, + seg_sizes, }), }) } @@ -367,19 +368,19 @@ impl InMemoryLayer { // Write operations /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result { + pub fn put_wal_record(&self, lsn: Lsn, blknum: SegmentBlk, rec: WALRecord) -> Result { self.put_page_version(blknum, lsn, PageVersion::Wal(rec)) } /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result { + pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result { self.put_page_version(blknum, lsn, PageVersion::Page(img)) } /// Common subroutine of the public put_wal_record() and put_page_image() 
functions. /// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result { - assert!(self.seg.blknum_in_seg(blknum)); + pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result { + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); trace!( "put_page_version blk {} of {} at {}/{}", @@ -404,7 +405,7 @@ impl InMemoryLayer { // Also update the relation size, if this extended the relation. if self.seg.rel.is_blocky() { - let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1; + let newsize = blknum + 1; // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, // which we've just acquired above @@ -426,8 +427,7 @@ impl InMemoryLayer { // PostgreSQL writes its WAL records and there's no guarantee of it. If it does // happen, we would hit the "page version already exists" warning above on the // subsequent call to initialize the gap page. - let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize; - for gapblknum in gapstart..blknum { + for gapblknum in oldsize..blknum { let zeropv = PageVersion::Page(ZERO_PAGE.clone()); trace!( "filling gap blk {} with zeros for write of {}", @@ -441,13 +441,13 @@ impl InMemoryLayer { if old.is_some() { warn!( - "Page version of rel {} blk {} at {} already exists", - self.seg.rel, blknum, lsn + "Page version of seg {} blk {} at {} already exists", + self.seg, blknum, lsn ); } } - inner.segsizes.append_or_update_last(lsn, newsize).unwrap(); + inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap(); return Ok(newsize - oldsize); } } @@ -456,7 +456,7 @@ impl InMemoryLayer { } /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, segsize: u32) { + pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) { assert!( self.seg.rel.is_blocky(), "put_truncation() called on a non-blocky rel" @@ -466,10 +466,13 @@ impl InMemoryLayer { inner.assert_writeable(); // 
check that this we truncate to a smaller size than segment was before the truncation - let oldsize = inner.get_seg_size(lsn); - assert!(segsize < oldsize); + let old_size = inner.get_seg_size(lsn); + assert!(new_size < old_size); - let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap(); + let (old, _delta_size) = inner + .seg_sizes + .append_or_update_last(lsn, new_size) + .unwrap(); if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -515,10 +518,10 @@ impl InMemoryLayer { ); // Copy the segment size at the start LSN from the predecessor layer. - let mut segsizes = VecMap::default(); + let mut seg_sizes = VecMap::default(); if seg.rel.is_blocky() { let size = src.get_seg_size(start_lsn)?; - segsizes.append(start_lsn, size).unwrap(); + seg_sizes.append(start_lsn, size).unwrap(); } let file = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -535,7 +538,7 @@ impl InMemoryLayer { end_lsn: None, dropped: false, page_versions: PageVersions::new(file), - segsizes, + seg_sizes, }), }) } @@ -558,7 +561,7 @@ impl InMemoryLayer { assert!(self.start_lsn < end_lsn + 1); inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); - if let Some((lsn, _)) = inner.segsizes.as_slice().last() { + if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() { assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); } @@ -606,7 +609,7 @@ impl InMemoryLayer { true, &inner.page_versions, None, - inner.segsizes.clone(), + inner.seg_sizes.clone(), )?; trace!( "freeze: created delta layer for dropped segment {} {}-{}", @@ -628,7 +631,7 @@ impl InMemoryLayer { let mut delta_layers = Vec::new(); if self.start_lsn != end_lsn_inclusive { - let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive); + let (seg_sizes, _) = inner.seg_sizes.split_at(&end_lsn_exclusive); // Write the page versions before the cutoff to disk. 
let delta_layer = DeltaLayer::create( self.conf, @@ -640,7 +643,7 @@ impl InMemoryLayer { false, &inner.page_versions, Some(end_lsn_inclusive), - segsizes, + seg_sizes, )?; delta_layers.push(delta_layer); trace!( diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 5988f28255..dab650d2ab 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -3,7 +3,7 @@ //! use crate::relish::RelishTag; -use crate::repository::WALRecord; +use crate::repository::{BlockNumber, WALRecord}; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; @@ -26,6 +26,18 @@ pub struct SegmentTag { pub segno: u32, } +/// SegmentBlk represents a block number within a segment, or the size of segment. +/// +/// This is separate from BlockNumber, which is used for block number within the +/// whole relish. Since this is just a type alias, the compiler will let you mix +/// them freely, but we use the type alias as documentation to make it clear +/// which one we're dealing with. +/// +/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally +/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes +/// operations more verbose). +pub type SegmentBlk = u32; + impl fmt::Display for SegmentTag { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}.{}", self.rel, self.segno) @@ -33,15 +45,16 @@ impl fmt::Display for SegmentTag { } impl SegmentTag { - pub const fn from_blknum(rel: RelishTag, blknum: u32) -> SegmentTag { - SegmentTag { - rel, - segno: blknum / RELISH_SEG_SIZE, - } - } - - pub fn blknum_in_seg(&self, blknum: u32) -> bool { - blknum / RELISH_SEG_SIZE == self.segno + /// Given a relish and block number, calculate the corresponding segment and + /// block number within the segment. 
+ pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) { + ( + SegmentTag { + rel, + segno: blknum / RELISH_SEG_SIZE, + }, + blknum % RELISH_SEG_SIZE, + ) } } @@ -125,10 +138,6 @@ pub trait Layer: Send + Sync { /// It is up to the caller to collect more data from previous layer and /// perform WAL redo, if necessary. /// - /// Note that the 'blknum' is the offset of the page from the beginning - /// of the *relish*, not the beginning of the segment. The requested - /// 'blknum' must be covered by this segment. - /// /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`. /// This function will only return data after `cached_img_lsn`. /// @@ -139,14 +148,14 @@ pub trait Layer: Send + Sync { /// to collect more data. fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result; /// Return size of the segment at given LSN. (Only for blocky relations.) - fn get_seg_size(&self, lsn: Lsn) -> Result; + fn get_seg_size(&self, lsn: Lsn) -> Result; /// Does the segment exist at given LSN? Or was it dropped before it. fn get_seg_exists(&self, lsn: Lsn) -> Result; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 077969131f..74f2969ecd 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -10,6 +10,9 @@ use std::time::Duration; use zenith_utils::lsn::{Lsn, RecordLsn}; use zenith_utils::zid::ZTimelineId; +/// Block number within a relish. This matches PostgreSQL's BlockNumber type. +pub type BlockNumber = u32; + /// /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. @@ -180,10 +183,10 @@ pub trait Timeline: Send + Sync { fn wait_lsn(&self, lsn: Lsn) -> Result<()>; /// Look up given page version. 
- fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result; + fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result; /// Get size of a relish - fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; + fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; /// Does relation exist? fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result; @@ -255,13 +258,25 @@ pub trait TimelineWriter: Deref { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>; + fn put_wal_record( + &self, + lsn: Lsn, + tag: RelishTag, + blknum: BlockNumber, + rec: WALRecord, + ) -> Result<()>; /// Like put_wal_record, but with ready-made image of the page. - fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>; + fn put_page_image( + &self, + tag: RelishTag, + blknum: BlockNumber, + lsn: Lsn, + img: Bytes, + ) -> Result<()>; /// Truncate relation - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>; + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: BlockNumber) -> Result<()>; /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; @@ -360,7 +375,7 @@ pub mod repo_harness { fn request_redo( &self, rel: RelishTag, - blknum: u32, + blknum: BlockNumber, lsn: Lsn, base_img: Option, records: Vec<(Lsn, WALRecord)>, @@ -690,7 +705,7 @@ mod tests { for blknum in 0..pg_constants::RELSEG_SIZE + 1 { let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); lsn += 0x10; - writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?; + writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?; } writer.advance_last_record_lsn(Lsn(lsn)); @@ -726,11 +741,11 @@ mod tests { let mut size: 
i32 = 3000; while size >= 0 { lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?; + writer.put_truncation(TESTREL_A, Lsn(lsn), size as BlockNumber)?; writer.advance_last_record_lsn(Lsn(lsn)); assert_eq!( tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - size as u32 + size as BlockNumber ); size -= 1;