diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 44bcb60936..bc99b9bd56 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,7 +47,9 @@ mod storage_layer; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; use snapshot_layer::SnapshotLayer; -use storage_layer::{Layer, SegmentTag, RELISH_SEG_SIZE}; +use storage_layer::{Layer, PageReconstructData, SegmentTag, RELISH_SEG_SIZE}; + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call. static TIMEOUT: Duration = Duration::from_secs(60); @@ -466,22 +468,9 @@ pub struct LayeredTimeline { impl Timeline for LayeredTimeline { /// Look up given page in the cache. fn get_page_at_lsn(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { - if !rel.is_blocky() && blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - blknum, - rel - ); - } let lsn = self.wait_lsn(lsn)?; - let seg = SegmentTag::from_blknum(rel, blknum); - - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - layer.get_page_at_lsn(&*self.walredo_mgr, blknum, lsn) - } else { - bail!("relish {} not found at {}", rel, lsn); - } + self.get_page_at_lsn_nowait(rel, blknum, lsn) } fn get_page_at_lsn_nowait(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { @@ -496,7 +485,7 @@ impl Timeline for LayeredTimeline { let seg = SegmentTag::from_blknum(rel, blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? 
{ - layer.get_page_at_lsn(&*self.walredo_mgr, blknum, lsn) + self.materialize_page(seg, blknum, lsn, &*layer) } else { bail!("relish {} not found at {}", rel, lsn); } @@ -1040,7 +1029,7 @@ impl LayeredTimeline { ); layer = InMemoryLayer::copy_snapshot( self.conf, - &*self.walredo_mgr, + &self, &*prev_layer, self.timelineid, self.tenantid, @@ -1147,7 +1136,7 @@ impl LayeredTimeline { // Call unload() on all frozen layers, to release memory. for layer in old_layers.values() { if !layer.is_frozen() { - let new_layers = layer.freeze(last_valid_lsn, &*self.walredo_mgr)?; + let new_layers = layer.freeze(last_valid_lsn, &self)?; // replace this layer with the new layers that 'freeze' returned layers.remove(&**layer); @@ -1318,4 +1307,104 @@ impl LayeredTimeline { result.elapsed = now.elapsed(); Ok(result) } + + /// + /// Reconstruct a page version from given Layer + /// + fn materialize_page( + &self, + seg: SegmentTag, + blknum: u32, + lsn: Lsn, + layer: &dyn Layer, + ) -> Result { + let mut data = PageReconstructData { + records: Vec::new(), + page_img: None, + }; + + if let Some(_cont_lsn) = layer.get_page_reconstruct_data(blknum, lsn, &mut data)? { + // The layers are currently fully self-contained, so we should have found all + // the data we need to reconstruct the page in the layer. + if data.records.is_empty() { + // no records, and no base image. This can happen if PostgreSQL extends a relation + // but never writes the page. + // + // Would be nice to detect that situation better. + warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn); + return Ok(ZERO_PAGE.clone()); + } + bail!( + "No base image found for page {} blk {} at {}/{}", + seg.rel, + blknum, + self.timelineid, + lsn, + ); + } + self.reconstruct_page(seg.rel, blknum, lsn, data) + } + + /// + /// Reconstruct a page version, using the given base image and WAL records in 'data'. 
+ /// + fn reconstruct_page( + &self, + rel: RelishTag, + blknum: u32, + request_lsn: Lsn, + mut data: PageReconstructData, + ) -> Result { + // Perform WAL redo if needed + data.records.reverse(); + + // If we have a page image, and no WAL, we're all set + if data.records.is_empty() { + if let Some(img) = &data.page_img { + trace!( + "found page image for blk {} in {} at {}/{}, no WAL redo required", + blknum, + rel, + self.timelineid, + request_lsn + ); + Ok(img.clone()) + } else { + // FIXME: this ought to be an error? + warn!("Page {} blk {} at {} not found", rel, blknum, request_lsn); + Ok(ZERO_PAGE.clone()) + } + } else { + // We need to do WAL redo. + // + // If we don't have a base image, then the oldest WAL record better initialize + // the page + if data.page_img.is_none() && !data.records.first().unwrap().will_init { + // FIXME: this ought to be an error? + warn!( + "Base image for page {}/{} at {} not found, but got {} WAL records", + rel, + blknum, + request_lsn, + data.records.len() + ); + Ok(ZERO_PAGE.clone()) + } else { + if data.page_img.is_some() { + trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn); + } else { + trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", data.records.len(), blknum, rel, self.timelineid, request_lsn); + } + let img = self.walredo_mgr.request_redo( + rel, + blknum, + request_lsn, + data.page_img.clone(), + data.records, + )?; + + Ok(img) + } + } + } } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index d59a82b8cd..3c0024bca6 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -3,14 +3,14 @@ //! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation. //! 
-use crate::layered_repository::storage_layer::{Layer, PageVersion, SegmentTag, RELISH_SEG_SIZE}; +use crate::layered_repository::storage_layer::{ + Layer, PageReconstructData, PageVersion, SegmentTag, RELISH_SEG_SIZE, +}; +use crate::layered_repository::LayeredTimeline; use crate::layered_repository::SnapshotLayer; -use crate::repository::WALRecord; -use crate::walredo::WalRedoManager; use crate::PageServerConf; use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, Result}; -use bytes::Bytes; use log::*; use std::collections::BTreeMap; use std::ops::Bound::Included; @@ -18,8 +18,6 @@ use std::sync::{Arc, Mutex}; use zenith_utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, @@ -87,15 +85,13 @@ impl Layer for InMemoryLayer { } /// Look up given page in the cache. - fn get_page_at_lsn( + fn get_page_reconstruct_data( &self, - walredo_mgr: &dyn WalRedoManager, blknum: u32, lsn: Lsn, - ) -> Result { - // Scan the BTreeMap backwards, starting from the given entry. - let mut records: Vec = Vec::new(); - let mut page_img: Option = None; + reconstruct_data: &mut PageReconstructData, + ) -> Result> { + // Scan the BTreeMap backwards, starting from the given 'lsn'.
let mut need_base_image_lsn: Option = Some(lsn); assert!(self.seg.blknum_in_seg(blknum)); @@ -109,11 +105,11 @@ impl Layer for InMemoryLayer { .range((Included(&minkey), Included(&maxkey))); while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() { if let Some(img) = &entry.page_image { - page_img = Some(img.clone()); + reconstruct_data.page_img = Some(img.clone()); need_base_image_lsn = None; break; } else if let Some(rec) = &entry.record { - records.push(rec.clone()); + reconstruct_data.records.push(rec.clone()); if rec.will_init { // This WAL record initializes the page, so no need to go further back need_base_image_lsn = None; @@ -129,71 +125,8 @@ impl Layer for InMemoryLayer { // release lock on 'page_versions' } - records.reverse(); - // If we needed a base image to apply the WAL records against, we should have found it in memory. - if let Some(lsn) = need_base_image_lsn { - if records.is_empty() { - // no records, and no base image. This can happen if PostgreSQL extends a relation - // but never writes the page. - // - // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", self.seg.rel, blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - bail!( - "No base image found for page {} blk {} at {}/{}", - self.seg.rel, - blknum, - self.timelineid, - lsn - ); - } - - // If we have a page image, and no WAL, we're all set - if records.is_empty() { - if let Some(img) = page_img { - trace!( - "found page image for blk {} in {} at {}/{}, no WAL redo required", - blknum, - self.seg.rel, - self.timelineid, - lsn - ); - Ok(img) - } else { - // FIXME: this ought to be an error? - warn!("Page {} blk {} at {} not found", self.seg.rel, blknum, lsn); - Ok(ZERO_PAGE.clone()) - } - } else { - // We need to do WAL redo. - // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if page_img.is_none() && !records.first().unwrap().will_init { - // FIXME: this ought to be an error? 
- warn!( - "Base image for page {}/{} at {} not found, but got {} WAL records", - self.seg.rel, - blknum, - lsn, - records.len() - ); - Ok(ZERO_PAGE.clone()) - } else { - if page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.seg.rel, self.timelineid, lsn); - } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.seg.rel, self.timelineid, lsn); - } - let img = walredo_mgr.request_redo(self.seg.rel, blknum, lsn, page_img, records)?; - - self.put_page_image(blknum, lsn, img.clone())?; - - Ok(img) - } - } + Ok(need_base_image_lsn) } /// Get size of the relation at given LSN @@ -320,7 +253,8 @@ impl Layer for InMemoryLayer { fn freeze( &self, cutoff_lsn: Lsn, - walredo_mgr: &dyn WalRedoManager, + // This is needed just to call materialize_page() + timeline: &LayeredTimeline, ) -> Result>> { info!( "freezing in memory layer for {} on timeline {} at {}", @@ -398,7 +332,7 @@ impl Layer for InMemoryLayer { let new_layer = Self::copy_snapshot( self.conf, - walredo_mgr, + timeline, &snapfile, self.timelineid, self.tenantid, @@ -465,7 +399,7 @@ impl InMemoryLayer { /// pub fn copy_snapshot( conf: &'static PageServerConf, - walredo_mgr: &dyn WalRedoManager, + timeline: &LayeredTimeline, src: &dyn Layer, timelineid: ZTimelineId, tenantid: ZTenantId, @@ -494,7 +428,7 @@ impl InMemoryLayer { } for blknum in startblk..(startblk + size) { - let img = src.get_page_at_lsn(walredo_mgr, blknum, lsn)?; + let img = timeline.materialize_page(seg, blknum, lsn, src)?; let pv = PageVersion { page_image: Some(img), record: None, diff --git a/pageserver/src/layered_repository/snapshot_layer.rs b/pageserver/src/layered_repository/snapshot_layer.rs index 44695aa28b..83733d6cdb 100644 --- a/pageserver/src/layered_repository/snapshot_layer.rs +++ b/pageserver/src/layered_repository/snapshot_layer.rs @@ -37,15 +37,14 @@ 
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two //! parts: the page versions and the relation sizes. They are stored as separate chapters. //! -use crate::layered_repository::storage_layer::ZERO_PAGE; -use crate::layered_repository::storage_layer::{Layer, PageVersion, SegmentTag}; +use crate::layered_repository::storage_layer::{ + Layer, PageReconstructData, PageVersion, SegmentTag, +}; +use crate::layered_repository::LayeredTimeline; use crate::relish::*; -use crate::repository::WALRecord; -use crate::walredo::WalRedoManager; use crate::PageServerConf; use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, Result}; -use bytes::Bytes; use log::*; use std::collections::BTreeMap; use std::fmt; @@ -270,15 +269,13 @@ impl Layer for SnapshotLayer { } /// Look up given page in the cache. - fn get_page_at_lsn( + fn get_page_reconstruct_data( &self, - walredo_mgr: &dyn WalRedoManager, blknum: u32, lsn: Lsn, - ) -> Result { + reconstruct_data: &mut PageReconstructData, + ) -> Result> { // Scan the BTreeMap backwards, starting from the given entry. 
- let mut records: Vec = Vec::new(); - let mut page_img: Option = None; let mut need_base_image_lsn: Option = Some(lsn); { let inner = self.load()?; @@ -289,11 +286,11 @@ impl Layer for SnapshotLayer { .range((Included(&minkey), Included(&maxkey))); while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() { if let Some(img) = &entry.page_image { - page_img = Some(img.clone()); + reconstruct_data.page_img = Some(img.clone()); need_base_image_lsn = None; break; } else if let Some(rec) = &entry.record { - records.push(rec.clone()); + reconstruct_data.records.push(rec.clone()); if rec.will_init { // This WAL record initializes the page, so no need to go further back need_base_image_lsn = None; @@ -309,73 +306,8 @@ impl Layer for SnapshotLayer { // release lock on 'inner' } - records.reverse(); - // If we needed a base image to apply the WAL records against, we should have found it in memory. - if let Some(lsn) = need_base_image_lsn { - if records.is_empty() { - // no records, and no base image. This can happen if PostgreSQL extends a relation - // but never writes the page. - // - // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", self.seg.rel, blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - bail!( - "No base image found for page {} blk {} at {}/{}", - self.seg.rel, - blknum, - self.timelineid, - lsn - ); - } - - // If we have a page image, and no WAL, we're all set - if records.is_empty() { - if let Some(img) = page_img { - trace!( - "found page image for blk {} in {} at {}/{}, no WAL redo required", - blknum, - self.seg.rel, - self.timelineid, - lsn - ); - Ok(img) - } else { - // FIXME: this ought to be an error? - warn!("Page {} blk {} at {} not found", self.seg.rel, blknum, lsn); - Ok(ZERO_PAGE.clone()) - } - } else { - // We need to do WAL redo. 
- // - // If we don't have a base image, then the oldest WAL record better initialize - // the page - if page_img.is_none() && !records.first().unwrap().will_init { - // FIXME: this ought to be an error? - warn!( - "Base image for page {} blk {} at {} not found, but got {} WAL records", - self.seg.rel, - blknum, - lsn, - records.len() - ); - Ok(ZERO_PAGE.clone()) - } else { - if page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.seg.rel, self.timelineid, lsn); - } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.seg.rel, self.timelineid, lsn); - } - let img = walredo_mgr.request_redo(self.seg.rel, blknum, lsn, page_img, records)?; - - // FIXME: Should we memoize the page image in memory, so that - // we wouldn't need to reconstruct it again, if it's requested again? - //self.put_page_image(blknum, lsn, img.clone())?; - - Ok(img) - } - } + Ok(need_base_image_lsn) } /// Get size of the relation at given LSN @@ -428,11 +360,7 @@ impl Layer for SnapshotLayer { bail!("cannot modify historical snapshot layer"); } - fn freeze( - &self, - _end_lsn: Lsn, - _walredo_mgr: &dyn WalRedoManager, - ) -> Result>> { + fn freeze(&self, _end_lsn: Lsn, _timeline: &LayeredTimeline) -> Result>> { bail!("cannot freeze historical snapshot layer"); } @@ -585,7 +513,7 @@ impl SnapshotLayer { Ok(inner) } - /// Create SnapshotLayers representing all files on dik + /// Create SnapshotLayers representing all files on disk /// // TODO: returning an Iterator would be more idiomatic pub fn list_snapshot_files( diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 461d3cdd25..0d0ac6164a 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -2,9 +2,9 @@ //! 
Common traits and structs for layers //! +use crate::layered_repository::LayeredTimeline; use crate::relish::RelishTag; use crate::repository::WALRecord; -use crate::walredo::WalRedoManager; use crate::ZTimelineId; use anyhow::Result; use bytes::Bytes; @@ -14,8 +14,6 @@ use std::sync::Arc; use zenith_utils::lsn::Lsn; -pub static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - // Size of one segment in pages (10 MB) pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; @@ -73,12 +71,25 @@ pub struct PageVersion { pub record: Option, } +/// +/// Data needed to reconstruct a page version +/// +/// 'page_img' is the old base image of the page to start the WAL replay with. +/// It can be None, if the first WAL record initializes the page (will_init) +/// 'records' contains the records to apply over the base image. +/// +pub struct PageReconstructData { + pub records: Vec, + pub page_img: Option, +} + /// /// A Layer holds all page versions for one segment of a relish, in a range of LSNs. /// There are two kinds of layers, in-memory and snapshot layers. In-memory /// layers are used to ingest incoming WAL, and provide fast access /// to the recent page versions. Snaphot layers are stored on disk, and -/// are immutable. +/// are immutable. This trait presents the common functionality of +/// in-memory and snapshot layers. /// /// Each layer contains a full snapshot of the segment at the start /// LSN. In addition to that, it contains WAL (or more page images) @@ -99,18 +110,26 @@ pub trait Layer: Send + Sync { /// in-memory layers are always unfrozen. fn is_frozen(&self) -> bool; - // Functions that correspond to the Timeline trait functions. - - // Note that the 'blknum' is the offset of the page from the beginning - // of the *relish*, not the beginning of the segment. The requested - // 'blknum' must be covered by this segment. - fn get_page_at_lsn( + /// + /// Return data needed to reconstruct given page at LSN. 
+ /// + /// It is up to the caller to collect more data from the previous layer and + /// perform WAL redo, if necessary. + /// + /// If this returns Some, the returned data is not complete. The caller needs + /// to continue with the returned 'lsn'. + /// + /// Note that the 'blknum' is the offset of the page from the beginning + /// of the *relish*, not the beginning of the segment. The requested + /// 'blknum' must be covered by this segment. + fn get_page_reconstruct_data( &self, - walredo_mgr: &dyn WalRedoManager, blknum: u32, lsn: Lsn, - ) -> Result; + reconstruct_data: &mut PageReconstructData, + ) -> Result>; + // Functions that correspond to the Timeline trait functions. fn get_seg_size(&self, lsn: Lsn) -> Result; fn get_seg_exists(&self, lsn: Lsn) -> Result; @@ -150,8 +169,7 @@ pub trait Layer: Send + Sync { /// /// Returns new layers that replace this one. /// - fn freeze(&self, end_lsn: Lsn, walredo_mgr: &dyn WalRedoManager) - -> Result>>; + fn freeze(&self, end_lsn: Lsn, walredo_mgr: &LayeredTimeline) -> Result>>; /// Permanently delete this layer fn delete(&self) -> Result<()>;