diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b0856bc835..aa225751bc 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -36,8 +36,8 @@ use crate::page_cache; use crate::relish::*; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ - GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, - WALRecord, + BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, + TimelineWriter, WALRecord, }; use crate::tenant_mgr; use crate::walreceiver; @@ -76,7 +76,7 @@ use image_layer::ImageLayer; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; use storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, RELISH_SEG_SIZE, }; // re-export this function so that page_cache.rs can use it. @@ -804,11 +804,11 @@ impl Timeline for LayeredTimeline { } /// Look up given page version. - fn get_page_at_lsn(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { - if !rel.is_blocky() && blknum != 0 { + fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } @@ -821,18 +821,18 @@ impl Timeline for LayeredTimeline { lsn, latest_gc_cutoff_lsn ); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? 
{ RECONSTRUCT_TIME - .observe_closure_duration(|| self.materialize_page(seg, blknum, lsn, &*layer)) + .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes // the page. See https://github.com/zenithdb/zenith/issues/841 // // Would be nice to detect that situation better. if seg.segno > 0 && self.get_rel_exists(rel, lsn)? { - warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn); + warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); return Ok(ZERO_PAGE.clone()); } @@ -840,7 +840,7 @@ impl Timeline for LayeredTimeline { } } - fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { + fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { if !rel.is_blocky() { bail!( "invalid get_relish_size request for non-blocky relish {}", @@ -1774,7 +1774,8 @@ impl LayeredTimeline { ); match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { Some(size) => { - let last_live_seg = SegmentTag::from_blknum(seg.rel, size - 1); + let (last_live_seg, _rel_blknum) = + SegmentTag::from_blknum(seg.rel, size - 1); info!( "blocky rel size is {} last_live_seg.segno {} seg.segno {}", size, last_live_seg.segno, seg.segno @@ -1851,14 +1852,19 @@ impl LayeredTimeline { Ok(result) } - fn lookup_cached_page(&self, seg: &SegmentTag, blknum: u32, lsn: Lsn) -> Option<(Lsn, Bytes)> { + fn lookup_cached_page( + &self, + rel: &RelishTag, + rel_blknum: BlockNumber, + lsn: Lsn, + ) -> Option<(Lsn, Bytes)> { let cache = page_cache::get(); - if let RelishTag::Relation(rel_tag) = &seg.rel { + if let RelishTag::Relation(rel_tag) = &rel { let (lsn, read_guard) = cache.lookup_materialized_page( self.tenantid, self.timelineid, *rel_tag, - blknum, + rel_blknum, lsn, )?; let img = Bytes::from(read_guard.to_vec()); @@ -1874,7 +1880,7 @@ impl LayeredTimeline { fn materialize_page( &self, seg: SegmentTag, - blknum: u32, + seg_blknum: SegmentBlk, lsn: Lsn, layer: 
&dyn Layer, ) -> Result { @@ -1882,7 +1888,10 @@ impl LayeredTimeline { // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed // for redo. - let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&seg, blknum, lsn) { + let rel = seg.rel; + let rel_blknum = seg.segno * RELISH_SEG_SIZE + seg_blknum; + let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&rel, rel_blknum, lsn) + { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check @@ -1909,13 +1918,13 @@ impl LayeredTimeline { let mut curr_lsn = lsn; loop { let result = layer_ref - .get_page_reconstruct_data(blknum, curr_lsn, cached_lsn_opt, &mut data) + .get_page_reconstruct_data(seg_blknum, curr_lsn, cached_lsn_opt, &mut data) .with_context(|| { format!( "Failed to get reconstruct data {} {:?} {} {} {:?}", layer_ref.get_seg_tag(), layer_ref.filename(), - blknum, + seg_blknum, curr_lsn, cached_lsn_opt, ) @@ -1953,13 +1962,13 @@ impl LayeredTimeline { // but never writes the page. // // Would be nice to detect that situation better. 
- warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn); + warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); return Ok(ZERO_PAGE.clone()); } bail!( "No base image found for page {} blk {} at {}/{}", - seg.rel, - blknum, + rel, + rel_blknum, self.timelineid, lsn, ); @@ -1977,7 +1986,7 @@ impl LayeredTimeline { } } - self.reconstruct_page(seg.rel, blknum, lsn, data) + self.reconstruct_page(rel, rel_blknum, lsn, data) } /// @@ -1986,7 +1995,7 @@ impl LayeredTimeline { fn reconstruct_page( &self, rel: RelishTag, - blknum: u32, + rel_blknum: BlockNumber, request_lsn: Lsn, mut data: PageReconstructData, ) -> Result { @@ -1998,14 +2007,17 @@ impl LayeredTimeline { if let Some(img) = &data.page_img { trace!( "found page image for blk {} in {} at {}, no WAL redo required", - blknum, + rel_blknum, rel, request_lsn ); Ok(img.clone()) } else { // FIXME: this ought to be an error? - warn!("Page {} blk {} at {} not found", rel, blknum, request_lsn); + warn!( + "Page {} blk {} at {} not found", + rel, rel_blknum, request_lsn + ); Ok(ZERO_PAGE.clone()) } } else { @@ -2018,23 +2030,23 @@ impl LayeredTimeline { warn!( "Base image for page {}/{} at {} not found, but got {} WAL records", rel, - blknum, + rel_blknum, request_lsn, data.records.len() ); Ok(ZERO_PAGE.clone()) } else { if data.page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); + trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn); + trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); } let last_rec_lsn = data.records.last().unwrap().0; let img = 
self.walredo_mgr.request_redo( rel, - blknum, + rel_blknum, request_lsn, data.page_img.clone(), data.records, @@ -2046,7 +2058,7 @@ impl LayeredTimeline { self.tenantid, self.timelineid, *rel_tag, - blknum, + rel_blknum, last_rec_lsn, &img, ); @@ -2106,45 +2118,57 @@ impl Deref for LayeredTimelineWriter<'_> { } impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { - fn put_wal_record(&self, lsn: Lsn, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { + fn put_wal_record( + &self, + lsn: Lsn, + rel: RelishTag, + rel_blknum: BlockNumber, + rec: WALRecord, + ) -> Result<()> { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } ensure!(lsn.is_aligned(), "unaligned record LSN"); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_wal_record(lsn, blknum, rec)?; + let delta_size = layer.put_wal_record(lsn, seg_blknum, rec)?; self.tl .increase_current_logical_size(delta_size * BLCKSZ as u32); Ok(()) } - fn put_page_image(&self, rel: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> { - if !rel.is_blocky() && blknum != 0 { + fn put_page_image( + &self, + rel: RelishTag, + rel_blknum: BlockNumber, + lsn: Lsn, + img: Bytes, + ) -> Result<()> { + if !rel.is_blocky() && rel_blknum != 0 { bail!( "invalid request for block {} for non-blocky relish {}", - blknum, + rel_blknum, rel ); } ensure!(lsn.is_aligned(), "unaligned record LSN"); - let seg = SegmentTag::from_blknum(rel, blknum); + let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_page_image(blknum, lsn, img)?; + let delta_size = layer.put_page_image(seg_blknum, lsn, img)?; self.tl .increase_current_logical_size(delta_size * BLCKSZ as u32); 
Ok(()) } - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> Result<()> { + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: BlockNumber) -> Result<()> { if !rel.is_blocky() { bail!("invalid truncation for non-blocky relish {}", rel); } @@ -2232,7 +2256,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { } } else { // TODO handle TwoPhase relishes - let seg = SegmentTag::from_blknum(rel, 0); + let (seg, _seg_blknum) = SegmentTag::from_blknum(rel, 0); let layer = self.tl.get_layer_for_write(seg, lsn)?; layer.drop_segment(lsn); } diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 11eec36824..b345e55347 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -11,7 +11,7 @@ //! can happen when you create a new branch in the middle of a delta layer, and the WAL //! records on the new branch are put in a new delta layer. //! -//! When a delta file needs to be accessed, we slurp the metadata and relsize chapters +//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters //! into memory, into the DeltaLayerInner struct. See load() and unload() functions. //! To access a page/WAL record, we search `page_version_metas` for the block # and LSN. //! The byte ranges in the metadata can be used to find the page/WAL record in @@ -35,13 +35,14 @@ //! file contents in any way. //! //! A detlta file is constructed using the 'bookfile' crate. Each file consists of two -//! parts: the page versions and the relation sizes. They are stored as separate chapters. +//! parts: the page versions and the segment sizes. They are stored as separate chapters. //! 
use crate::layered_repository::blob::BlobWriter; use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::page_versions::PageVersions; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, + RELISH_SEG_SIZE, }; use crate::virtual_file::VirtualFile; use crate::walrecord; @@ -76,7 +77,7 @@ static PAGE_VERSION_METAS_CHAPTER: u64 = 1; /// Page/WAL bytes - cannot be interpreted /// without PAGE_VERSION_METAS_CHAPTER static PAGE_VERSIONS_CHAPTER: u64 = 2; -static REL_SIZES_CHAPTER: u64 = 3; +static SEG_SIZES_CHAPTER: u64 = 3; /// Contains the [`Summary`] struct static SUMMARY_CHAPTER: u64 = 4; @@ -136,7 +137,7 @@ pub struct DeltaLayer { } pub struct DeltaLayerInner { - /// If false, the 'page_version_metas' and 'relsizes' have not been + /// If false, the 'page_version_metas' and 'seg_sizes' have not been /// loaded into memory yet. loaded: bool, @@ -144,16 +145,16 @@ pub struct DeltaLayerInner { /// All versions of all pages in the file are are kept here. /// Indexed by block number and LSN. - page_version_metas: VecMap<(u32, Lsn), BlobRange>, + page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, - /// `relsizes` tracks the size of the relation at different points in time. - relsizes: VecMap<Lsn, u32>, + /// `seg_sizes` tracks the size of the segment at different points in time. + seg_sizes: VecMap<Lsn, SegmentBlk>, } impl DeltaLayerInner { - fn get_seg_size(&self, lsn: Lsn) -> Result<u32> { + fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> { let slice = self - .relsizes + .seg_sizes .slice_range((Included(&Lsn(0)), Included(&lsn))); if let Some((_entry_lsn, entry)) = slice.last() { Ok(*entry) } else { @@ -195,14 +196,14 @@ impl Layer for DeltaLayer { /// Look up given page in the cache.
fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { let mut need_image = true; - assert!(self.seg.blknum_in_seg(blknum)); + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); match &cached_img_lsn { Some(cached_lsn) if &self.end_lsn <= cached_lsn => { @@ -261,7 +262,7 @@ impl Layer for DeltaLayer { if need_image && reconstruct_data.records.is_empty() && self.seg.rel.is_blocky() - && blknum - self.seg.segno * RELISH_SEG_SIZE >= inner.get_seg_size(lsn)? + && blknum >= inner.get_seg_size(lsn)? { return Ok(PageReconstructResult::Missing(self.start_lsn)); } @@ -279,7 +280,7 @@ impl Layer for DeltaLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { + fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), @@ -309,7 +310,7 @@ impl Layer for DeltaLayer { fn unload(&self) -> Result<()> { let mut inner = self.inner.lock().unwrap(); inner.page_version_metas = VecMap::default(); - inner.relsizes = VecMap::default(); + inner.seg_sizes = VecMap::default(); inner.loaded = false; // Note: we keep the Book open. Is that a good idea? The virtual file @@ -340,9 +341,9 @@ impl Layer for DeltaLayer { self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn ); - println!("--- relsizes ---"); + println!("--- seg sizes ---"); let inner = self.load()?; - for (k, v) in inner.relsizes.as_slice() { + for (k, v) in inner.seg_sizes.as_slice() { println!(" {}: {}", k, v); } println!("--- page versions ---"); @@ -396,12 +397,12 @@ impl DeltaLayer { } } - /// Create a new delta file, using the given page versions and relsizes. + /// Create a new delta file, using the given page versions and seg_sizes. /// The page versions are passed in a PageVersions struct. If 'cutoff' is /// given, only page versions with LSN < cutoff are included. 
/// /// This is used to write the in-memory layer to disk. The page_versions and - /// relsizes are thus passed in the same format as they are in the in-memory + /// seg_sizes are thus passed in the same format as they are in the in-memory /// layer, as that's expedient. #[allow(clippy::too_many_arguments)] pub fn create( @@ -414,10 +415,10 @@ impl DeltaLayer { dropped: bool, page_versions: &PageVersions, cutoff: Option, - relsizes: VecMap, + seg_sizes: VecMap, ) -> Result { if seg.rel.is_blocky() { - assert!(!relsizes.is_empty()); + assert!(!seg_sizes.is_empty()); } let delta_layer = DeltaLayer { @@ -432,7 +433,7 @@ impl DeltaLayer { loaded: false, book: None, page_version_metas: VecMap::default(), - relsizes, + seg_sizes, }), }; let mut inner = delta_layer.inner.lock().unwrap(); @@ -471,9 +472,9 @@ impl DeltaLayer { chapter.write_all(&buf)?; let book = chapter.close()?; - // and relsizes to separate chapter - let mut chapter = book.new_chapter(REL_SIZES_CHAPTER); - let buf = VecMap::ser(&inner.relsizes)?; + // and seg_sizes to separate chapter + let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER); + let buf = VecMap::ser(&inner.seg_sizes)?; chapter.write_all(&buf)?; let book = chapter.close()?; @@ -550,13 +551,13 @@ impl DeltaLayer { let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; let page_version_metas = VecMap::des(&chapter)?; - let chapter = book.read_chapter(REL_SIZES_CHAPTER)?; - let relsizes = VecMap::des(&chapter)?; + let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?; + let seg_sizes = VecMap::des(&chapter)?; debug!("loaded from {}", &path.display()); inner.page_version_metas = page_version_metas; - inner.relsizes = relsizes; + inner.seg_sizes = seg_sizes; inner.loaded = true; Ok(inner) @@ -581,7 +582,7 @@ impl DeltaLayer { loaded: false, book: None, page_version_metas: VecMap::default(), - relsizes: VecMap::default(), + seg_sizes: VecMap::default(), }), } } @@ -608,7 +609,7 @@ impl DeltaLayer { loaded: false, book: None, 
page_version_metas: VecMap::default(), - relsizes: VecMap::default(), + seg_sizes: VecMap::default(), }), }) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 22cf3347d8..3154934287 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -23,7 +23,7 @@ //! use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentTag, + Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, }; use crate::layered_repository::LayeredTimeline; use crate::layered_repository::RELISH_SEG_SIZE; @@ -99,7 +99,7 @@ pub struct ImageLayer { #[derive(Clone)] enum ImageType { - Blocky { num_blocks: u32 }, + Blocky { num_blocks: SegmentBlk }, NonBlocky, } @@ -144,11 +144,12 @@ impl Layer for ImageLayer { /// Look up given page in the file fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); assert!(lsn >= self.lsn); match cached_img_lsn { @@ -158,17 +159,15 @@ impl Layer for ImageLayer { let inner = self.load()?; - let base_blknum = blknum % RELISH_SEG_SIZE; - let buf = match &inner.image_type { ImageType::Blocky { num_blocks } => { // Check if the request is beyond EOF - if base_blknum >= *num_blocks { + if blknum >= *num_blocks { return Ok(PageReconstructResult::Missing(lsn)); } let mut buf = vec![0u8; BLOCK_SIZE]; - let offset = BLOCK_SIZE as u64 * base_blknum as u64; + let offset = BLOCK_SIZE as u64 * blknum as u64; let chapter = inner .book @@ -180,7 +179,7 @@ impl Layer for ImageLayer { buf } ImageType::NonBlocky => { - ensure!(base_blknum == 0); + ensure!(blknum == 0); inner .book .as_ref() @@ -195,7 +194,7 @@ impl Layer for ImageLayer { } /// Get size of the 
segment - fn get_seg_size(&self, _lsn: Lsn) -> Result { + fn get_seg_size(&self, _lsn: Lsn) -> Result { let inner = self.load()?; match inner.image_type { ImageType::Blocky { num_blocks } => Ok(num_blocks), @@ -276,7 +275,7 @@ impl ImageLayer { base_images: Vec, ) -> Result { let image_type = if seg.rel.is_blocky() { - let num_blocks: u32 = base_images.len().try_into()?; + let num_blocks: SegmentBlk = base_images.len().try_into()?; ImageType::Blocky { num_blocks } } else { assert_eq!(base_images.len(), 1); @@ -358,15 +357,11 @@ impl ImageLayer { let seg = src.get_seg_tag(); let timelineid = timeline.timelineid; - let startblk; - let size; - if seg.rel.is_blocky() { - size = src.get_seg_size(lsn)?; - startblk = seg.segno * RELISH_SEG_SIZE; + let size = if seg.rel.is_blocky() { + src.get_seg_size(lsn)? } else { - size = 1; - startblk = 0; - } + 1 + }; trace!( "creating new ImageLayer for {} on timeline {} at {}", @@ -376,7 +371,7 @@ impl ImageLayer { ); let mut base_images: Vec = Vec::new(); - for blknum in startblk..(startblk + size) { + for blknum in 0..size { let img = timeline.materialize_page(seg, blknum, lsn, &*src)?; base_images.push(img); @@ -435,7 +430,7 @@ impl ImageLayer { let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; let images_len = chapter.len(); ensure!(images_len % BLOCK_SIZE as u64 == 0); - let num_blocks: u32 = (images_len / BLOCK_SIZE as u64).try_into()?; + let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?; ImageType::Blocky { num_blocks } } else { let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?; diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index dd02c8f951..4ae6e80824 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -7,7 +7,8 @@ use crate::layered_repository::ephemeral_file::EphemeralFile; use crate::layered_repository::filename::DeltaFileName; use 
crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE, + Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, + RELISH_SEG_SIZE, }; use crate::layered_repository::LayeredTimeline; use crate::layered_repository::ZERO_PAGE; @@ -64,13 +65,13 @@ pub struct InMemoryLayerInner { page_versions: PageVersions, /// - /// `segsizes` tracks the size of the segment at different points in time. + /// `seg_sizes` tracks the size of the segment at different points in time. /// /// For a blocky rel, there is always one entry, at the layer's start_lsn, /// so that determining the size never depends on the predecessor layer. For - /// a non-blocky rel, 'segsizes' is not used and is always empty. + /// a non-blocky rel, 'seg_sizes' is not used and is always empty. /// - segsizes: VecMap, + seg_sizes: VecMap, } impl InMemoryLayerInner { @@ -78,9 +79,9 @@ impl InMemoryLayerInner { assert!(self.end_lsn.is_none()); } - fn get_seg_size(&self, lsn: Lsn) -> u32 { + fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { // Scan the BTreeMap backwards, starting from the given entry. - let slice = self.segsizes.slice_range(..=lsn); + let slice = self.seg_sizes.slice_range(..=lsn); // We make sure there is always at least one entry if let Some((_entry_lsn, entry)) = slice.last() { @@ -150,14 +151,14 @@ impl Layer for InMemoryLayer { /// Look up given page in the cache. 
fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result { let mut need_image = true; - assert!(self.seg.blknum_in_seg(blknum)); + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); { let inner = self.inner.read().unwrap(); @@ -198,7 +199,7 @@ impl Layer for InMemoryLayer { if need_image && reconstruct_data.records.is_empty() && self.seg.rel.is_blocky() - && blknum - self.seg.segno * RELISH_SEG_SIZE >= self.get_seg_size(lsn)? + && blknum >= self.get_seg_size(lsn)? { return Ok(PageReconstructResult::Missing(self.start_lsn)); } @@ -220,7 +221,7 @@ impl Layer for InMemoryLayer { } /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { + fn get_seg_size(&self, lsn: Lsn) -> Result { assert!(lsn >= self.start_lsn); ensure!( self.seg.rel.is_blocky(), @@ -291,8 +292,8 @@ impl Layer for InMemoryLayer { self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, ); - for (k, v) in inner.segsizes.as_slice() { - println!("segsizes {}: {}", k, v); + for (k, v) in inner.seg_sizes.as_slice() { + println!("seg_sizes {}: {}", k, v); } for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) { @@ -339,10 +340,10 @@ impl InMemoryLayer { start_lsn ); - // The segment is initially empty, so initialize 'segsizes' with 0. - let mut segsizes = VecMap::default(); + // The segment is initially empty, so initialize 'seg_sizes' with 0. 
+ let mut seg_sizes = VecMap::default(); if seg.rel.is_blocky() { - segsizes.append(start_lsn, 0).unwrap(); + seg_sizes.append(start_lsn, 0).unwrap(); } let file = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -359,7 +360,7 @@ impl InMemoryLayer { end_lsn: None, dropped: false, page_versions: PageVersions::new(file), - segsizes, + seg_sizes, }), }) } @@ -367,19 +368,19 @@ impl InMemoryLayer { // Write operations /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result { + pub fn put_wal_record(&self, lsn: Lsn, blknum: SegmentBlk, rec: WALRecord) -> Result { self.put_page_version(blknum, lsn, PageVersion::Wal(rec)) } /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result { + pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result { self.put_page_version(blknum, lsn, PageVersion::Page(img)) } /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result { - assert!(self.seg.blknum_in_seg(blknum)); + pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result { + assert!((0..RELISH_SEG_SIZE).contains(&blknum)); trace!( "put_page_version blk {} of {} at {}/{}", @@ -404,7 +405,7 @@ impl InMemoryLayer { // Also update the relation size, if this extended the relation. if self.seg.rel.is_blocky() { - let newsize = blknum - self.seg.segno * RELISH_SEG_SIZE + 1; + let newsize = blknum + 1; // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, // which we've just acquired above @@ -426,8 +427,7 @@ impl InMemoryLayer { // PostgreSQL writes its WAL records and there's no guarantee of it. 
If it does // happen, we would hit the "page version already exists" warning above on the // subsequent call to initialize the gap page. - let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize; - for gapblknum in gapstart..blknum { + for gapblknum in oldsize..blknum { let zeropv = PageVersion::Page(ZERO_PAGE.clone()); trace!( "filling gap blk {} with zeros for write of {}", @@ -441,13 +441,13 @@ impl InMemoryLayer { if old.is_some() { warn!( - "Page version of rel {} blk {} at {} already exists", - self.seg.rel, blknum, lsn + "Page version of seg {} blk {} at {} already exists", + self.seg, blknum, lsn ); } } - inner.segsizes.append_or_update_last(lsn, newsize).unwrap(); + inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap(); return Ok(newsize - oldsize); } } @@ -456,7 +456,7 @@ impl InMemoryLayer { } /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, segsize: u32) { + pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) { assert!( self.seg.rel.is_blocky(), "put_truncation() called on a non-blocky rel" @@ -466,10 +466,13 @@ impl InMemoryLayer { inner.assert_writeable(); // check that this we truncate to a smaller size than segment was before the truncation - let oldsize = inner.get_seg_size(lsn); - assert!(segsize < oldsize); + let old_size = inner.get_seg_size(lsn); + assert!(new_size < old_size); - let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap(); + let (old, _delta_size) = inner + .seg_sizes + .append_or_update_last(lsn, new_size) + .unwrap(); if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -515,10 +518,10 @@ impl InMemoryLayer { ); // Copy the segment size at the start LSN from the predecessor layer. 
- let mut segsizes = VecMap::default(); + let mut seg_sizes = VecMap::default(); if seg.rel.is_blocky() { let size = src.get_seg_size(start_lsn)?; - segsizes.append(start_lsn, size).unwrap(); + seg_sizes.append(start_lsn, size).unwrap(); } let file = EphemeralFile::create(conf, tenantid, timelineid)?; @@ -535,7 +538,7 @@ impl InMemoryLayer { end_lsn: None, dropped: false, page_versions: PageVersions::new(file), - segsizes, + seg_sizes, }), }) } @@ -558,7 +561,7 @@ impl InMemoryLayer { assert!(self.start_lsn < end_lsn + 1); inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); - if let Some((lsn, _)) = inner.segsizes.as_slice().last() { + if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() { assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); } @@ -606,7 +609,7 @@ impl InMemoryLayer { true, &inner.page_versions, None, - inner.segsizes.clone(), + inner.seg_sizes.clone(), )?; trace!( "freeze: created delta layer for dropped segment {} {}-{}", @@ -628,7 +631,7 @@ impl InMemoryLayer { let mut delta_layers = Vec::new(); if self.start_lsn != end_lsn_inclusive { - let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive); + let (seg_sizes, _) = inner.seg_sizes.split_at(&end_lsn_exclusive); // Write the page versions before the cutoff to disk. let delta_layer = DeltaLayer::create( self.conf, @@ -640,7 +643,7 @@ impl InMemoryLayer { false, &inner.page_versions, Some(end_lsn_inclusive), - segsizes, + seg_sizes, )?; delta_layers.push(delta_layer); trace!( diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 5988f28255..dab650d2ab 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -3,7 +3,7 @@ //! 
use crate::relish::RelishTag; -use crate::repository::WALRecord; +use crate::repository::{BlockNumber, WALRecord}; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; @@ -26,6 +26,18 @@ pub struct SegmentTag { pub segno: u32, } +/// SegmentBlk represents a block number within a segment, or the size of segment. +/// +/// This is separate from BlockNumber, which is used for block number within the +/// whole relish. Since this is just a type alias, the compiler will let you mix +/// them freely, but we use the type alias as documentation to make it clear +/// which one we're dealing with. +/// +/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally +/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes +/// operations more verbose). +pub type SegmentBlk = u32; + impl fmt::Display for SegmentTag { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}.{}", self.rel, self.segno) @@ -33,15 +45,16 @@ impl fmt::Display for SegmentTag { } impl SegmentTag { - pub const fn from_blknum(rel: RelishTag, blknum: u32) -> SegmentTag { - SegmentTag { - rel, - segno: blknum / RELISH_SEG_SIZE, - } - } - - pub fn blknum_in_seg(&self, blknum: u32) -> bool { - blknum / RELISH_SEG_SIZE == self.segno + /// Given a relish and block number, calculate the corresponding segment and + /// block number within the segment. + pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) { + ( + SegmentTag { + rel, + segno: blknum / RELISH_SEG_SIZE, + }, + blknum % RELISH_SEG_SIZE, + ) } } @@ -125,10 +138,6 @@ pub trait Layer: Send + Sync { /// It is up to the caller to collect more data from previous layer and /// perform WAL redo, if necessary. /// - /// Note that the 'blknum' is the offset of the page from the beginning - /// of the *relish*, not the beginning of the segment. The requested - /// 'blknum' must be covered by this segment. 
- /// /// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`. /// This function will only return data after `cached_img_lsn`. /// @@ -139,14 +148,14 @@ pub trait Layer: Send + Sync { /// to collect more data. fn get_page_reconstruct_data( &self, - blknum: u32, + blknum: SegmentBlk, lsn: Lsn, cached_img_lsn: Option, reconstruct_data: &mut PageReconstructData, ) -> Result; /// Return size of the segment at given LSN. (Only for blocky relations.) - fn get_seg_size(&self, lsn: Lsn) -> Result; + fn get_seg_size(&self, lsn: Lsn) -> Result; /// Does the segment exist at given LSN? Or was it dropped before it. fn get_seg_exists(&self, lsn: Lsn) -> Result; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 077969131f..74f2969ecd 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -10,6 +10,9 @@ use std::time::Duration; use zenith_utils::lsn::{Lsn, RecordLsn}; use zenith_utils::zid::ZTimelineId; +/// Block number within a relish. This matches PostgreSQL's BlockNumber type. +pub type BlockNumber = u32; + /// /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. @@ -180,10 +183,10 @@ pub trait Timeline: Send + Sync { fn wait_lsn(&self, lsn: Lsn) -> Result<()>; /// Look up given page version. - fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result; + fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result; /// Get size of a relish - fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; + fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result>; /// Does relation exist? fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result; @@ -255,13 +258,25 @@ pub trait TimelineWriter: Deref { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. 
- fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>; + fn put_wal_record( + &self, + lsn: Lsn, + tag: RelishTag, + blknum: BlockNumber, + rec: WALRecord, + ) -> Result<()>; /// Like put_wal_record, but with ready-made image of the page. - fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>; + fn put_page_image( + &self, + tag: RelishTag, + blknum: BlockNumber, + lsn: Lsn, + img: Bytes, + ) -> Result<()>; /// Truncate relation - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>; + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: BlockNumber) -> Result<()>; /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>; @@ -360,7 +375,7 @@ pub mod repo_harness { fn request_redo( &self, rel: RelishTag, - blknum: u32, + blknum: BlockNumber, lsn: Lsn, base_img: Option, records: Vec<(Lsn, WALRecord)>, @@ -690,7 +705,7 @@ mod tests { for blknum in 0..pg_constants::RELSEG_SIZE + 1 { let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); lsn += 0x10; - writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?; + writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?; } writer.advance_last_record_lsn(Lsn(lsn)); @@ -726,11 +741,11 @@ mod tests { let mut size: i32 = 3000; while size >= 0 { lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?; + writer.put_truncation(TESTREL_A, Lsn(lsn), size as BlockNumber)?; writer.advance_last_record_lsn(Lsn(lsn)); assert_eq!( tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - size as u32 + size as BlockNumber ); size -= 1;