diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 32fa2bc125..9cb5ab924a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -17,9 +17,7 @@ use anyhow::Result; use clap::{App, Arg, ArgMatches}; use daemonize::Daemonize; -use slog::{Drain, FnValue}; - -use pageserver::{branches, page_cache, page_service}; +use pageserver::{branches, logger, page_cache, page_service}; use pageserver::{PageServerConf, RepositoryFormat}; const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000"; diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index e488d7f5bc..0d860837ab 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -179,31 +179,6 @@ fn bootstrap_timeline( info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn); - // We don't use page_cache here, because we don't want to spawn the WAL redo thread during - // repository initialization. - // - // FIXME: That caused trouble, because the WAL redo thread launched initdb in the background, - // and it kept running even after the "zenith init" had exited. In tests, we started the - // page server immediately after that, so that initdb was still running in the background, - // and we failed to run initdb again in the same directory. This has been solved for the - // rapid init+start case now, but the general race condition remains if you restart the - // server quickly. - let walredo_mgr = std::sync::Arc::new(crate::walredo::DummyRedoManager {}); - let repo: Box = match conf.repository_format { - crate::RepositoryFormat::Layered => Box::new( - crate::layered_repository::LayeredRepository::new(conf, walredo_mgr), - ), - crate::RepositoryFormat::RocksDb => { - let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?; - - Box::new(crate::object_repository::ObjectRepository::new( - conf, - std::sync::Arc::new(storage), - walredo_mgr, - )) - } - }; - let timeline = repo.create_empty_timeline(tli, lsn)?; restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?; timeline.checkpoint()?; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 3598c711b0..86c0b2804b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -16,14 +16,19 @@ //! - A snapshot layer doesn't contain a snapshot at a specific LSN, but all page //! versions in a range of LSNs. So each snapshot file has a start and end LSN. //! +//! +//! Each layer contains a full snapshot of the relish at the start LSN. In addition +//! to that, it contains WAL (or more page images) needed to recontruct any page +//! version up to the end LSN. +//! use anyhow::{bail, Context, Result}; use bytes::Bytes; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::{HashSet, BTreeSet}; use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeSet, HashSet}; use std::fs; use std::fs::File; use std::io::Write; @@ -31,11 +36,12 @@ use std::ops::Bound::Included; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use crate::repository::{GcResult, History, RelTag, Repository, Timeline, WALRecord}; +use crate::relish::*; +use crate::repository::{GcResult, History, Repository, Timeline, WALRecord}; use crate::restore_local_repo::import_timeline_wal; use crate::walredo::WalRedoManager; use crate::PageServerConf; -use crate::ZTimelineId; +use crate::{ZTimelineId, ZTenantId}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::{AtomicLsn, Lsn}; @@ -57,6 +63,7 @@ static TIMEOUT: Duration = Duration::from_secs(60); /// pub struct LayeredRepository { conf: &'static PageServerConf, + tenantid: ZTenantId, timelines: Mutex>>, walredo_mgr: Arc, @@ -77,22 +84,24 @@ impl Repository for LayeredRepository { ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); - std::fs::create_dir_all(self.conf.timeline_path(timelineid))?; + std::fs::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?; // Write initial metadata. let metadata = TimelineMetadata { last_valid_lsn: start_lsn, last_record_lsn: start_lsn, + prev_record_lsn: Lsn(0), ancestor_timeline: None, ancestor_lsn: start_lsn, }; - Self::save_metadata(self.conf, timelineid, &metadata)?; + Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata)?; let timeline = LayeredTimeline::new( self.conf, metadata, None, timelineid, + self.tenantid, self.walredo_mgr.clone(), )?; @@ -104,19 +113,19 @@ impl Repository for LayeredRepository { /// Branch a timeline fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> { - // just to check the source timeline exists - let _src_timeline = self.get_timeline(src)?; + let src_timeline = self.get_timeline(src)?; // Create the metadata file, noting the ancestor of th new timeline. There is initially // no data in it, but all the read-calls know to look into the ancestor. let metadata = TimelineMetadata { last_valid_lsn: start_lsn, last_record_lsn: start_lsn, + prev_record_lsn: src_timeline.get_prev_record_lsn(), ancestor_timeline: Some(src), ancestor_lsn: start_lsn, }; - std::fs::create_dir_all(self.conf.timeline_path(dst))?; - Self::save_metadata(self.conf, dst, &metadata)?; + std::fs::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?; + Self::save_metadata(self.conf, dst, self.tenantid, &metadata)?; info!("branched timeline {} from {} at {}", dst, src, start_lsn); @@ -136,7 +145,7 @@ impl LayeredRepository { match timelines.get(&timelineid) { Some(timeline) => Ok(timeline.clone()), None => { - let metadata = Self::load_metadata(self.conf, timelineid)?; + let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?; let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline { Some(self.get_timeline_locked(ancestor_timelineid, timelines)?) @@ -149,6 +158,7 @@ impl LayeredRepository { metadata, ancestor, timelineid, + self.tenantid, self.walredo_mgr.clone(), )?; @@ -158,7 +168,7 @@ impl LayeredRepository { timelineid, timeline.get_last_record_lsn() ); - let wal_dir = self.conf.timeline_path(timelineid).join("wal"); + let wal_dir = self.conf.timeline_path(&timelineid, &self.tenantid).join("wal"); import_timeline_wal(&wal_dir, &timeline, timeline.get_last_record_lsn())?; let timeline_rc = Arc::new(timeline); @@ -171,8 +181,10 @@ impl LayeredRepository { pub fn new( conf: &'static PageServerConf, walredo_mgr: Arc, + tenantid: ZTenantId, ) -> LayeredRepository { LayeredRepository { + tenantid: tenantid, conf: conf, timelines: Mutex::new(HashMap::new()), walredo_mgr, @@ -183,9 +195,10 @@ impl LayeredRepository { fn save_metadata( conf: &'static PageServerConf, timelineid: ZTimelineId, + tenantid: ZTenantId, data: &TimelineMetadata, ) -> Result<()> { - let path = conf.timeline_path(timelineid).join("metadata"); + let path = conf.timeline_path(&timelineid, &tenantid).join("metadata"); let mut file = File::create(&path)?; file.write_all(&TimelineMetadata::ser(data)?)?; @@ -196,8 +209,9 @@ impl LayeredRepository { fn load_metadata( conf: &'static PageServerConf, timelineid: ZTimelineId, + tenantid: ZTenantId, ) -> Result { - let path = conf.timeline_path(timelineid).join("metadata"); + let path = conf.timeline_path(&timelineid, &tenantid).join("metadata"); let data = std::fs::read(&path)?; Ok(TimelineMetadata::des(&data)?) @@ -230,8 +244,7 @@ impl LayeredRepository { // // - Currently, this is only triggered manually by the 'do_gc' command. // There is no background thread to do it automatically. - fn gc_iteration(conf: &'static PageServerConf, horizon: u64) -> Result { - + fn gc_iteration(conf: &'static PageServerConf, tenantid: ZTenantId, horizon: u64) -> Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); @@ -248,9 +261,8 @@ impl LayeredRepository { let direntry = direntry?; if let Some(fname) = direntry.file_name().to_str() { if let Ok(timelineid) = fname.parse::() { - // Read the metadata of this timeline to get its parent timeline. - let metadata = Self::load_metadata(conf, timelineid)?; + let metadata = Self::load_metadata(conf, timelineid, tenantid)?; timelines.push((timelineid, metadata.last_record_lsn)); @@ -263,14 +275,16 @@ impl LayeredRepository { // Ok, we now know all the branch points. Iterate through them. for (timelineid, last_lsn) in timelines { - let branchpoints: Vec = all_branchpoints.range( - (Included((timelineid, Lsn(0))), - Included((timelineid, Lsn(u64::MAX))))) + let branchpoints: Vec = all_branchpoints + .range(( + Included((timelineid, Lsn(0))), + Included((timelineid, Lsn(u64::MAX))), + )) .map(|&x| x.1) .collect(); if let Some(cutoff) = last_lsn.checked_sub(horizon) { - let result = SnapshotLayer::gc_timeline(conf, timelineid, branchpoints, cutoff)?; + let result = SnapshotLayer::gc_timeline(conf, timelineid, tenantid, branchpoints, cutoff)?; totals += result; } @@ -281,9 +295,9 @@ impl LayeredRepository { } } -/// LayerMap is a BTreeMap keyed by RelTag and the layer's start LSN. +/// LayerMap is a BTreeMap keyed by RelishTag and the layer's start LSN. /// It provides a couple of convenience functions over a plain BTreeMap -struct LayerMap(BTreeMap<(RelTag, Lsn), Arc>); +struct LayerMap(BTreeMap<(RelishTag, Lsn), Arc>); impl LayerMap { /// @@ -292,7 +306,7 @@ impl LayerMap { /// given LSN, or precedes the given LSN, it is returned. In other words, /// you don't need to know the exact start LSN of the layer. /// - fn get(&self, tag: RelTag, lsn: Lsn) -> Option> { + fn get(&self, tag: RelishTag, lsn: Lsn) -> Option> { let startkey = (tag, Lsn(0)); let endkey = (tag, lsn); @@ -308,10 +322,10 @@ impl LayerMap { } fn insert(&mut self, layer: Arc) { - let tag = layer.get_tag(); + let rel = layer.get_relish_tag(); let start_lsn = layer.get_start_lsn(); - self.0.insert((tag, start_lsn), Arc::clone(&layer)); + self.0.insert((rel, start_lsn), Arc::clone(&layer)); } } @@ -326,6 +340,7 @@ impl Default for LayerMap { pub struct TimelineMetadata { last_valid_lsn: Lsn, last_record_lsn: Lsn, + prev_record_lsn: Lsn, ancestor_timeline: Option, ancestor_lsn: Lsn, } @@ -333,6 +348,7 @@ pub struct TimelineMetadata { pub struct LayeredTimeline { conf: &'static PageServerConf, + tenantid: ZTenantId, timelineid: ZTimelineId, layers: Mutex, @@ -359,6 +375,7 @@ pub struct LayeredTimeline { // last_valid_lsn: SeqWait, last_record_lsn: AtomicLsn, + prev_record_lsn: AtomicLsn, // Parent timeline that this timeline was branched from, and the LSN // of the branch point. @@ -369,18 +386,44 @@ pub struct LayeredTimeline { /// Public interface functions impl Timeline for LayeredTimeline { /// Look up given page in the cache. - fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result { - trace!("get_page_at_lsn: {:?} at {}", tag, lsn); + fn get_page_at_lsn(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { + if !rel.is_blocky() && blknum != 0 { + bail!( + "invalid request for block {} for non-blocky relish {}", + blknum, + rel + ); + } let lsn = self.wait_lsn(lsn)?; - if let Some((layer, lsn)) = self.get_layer_for_read(tag.rel, lsn)? { - layer.get_page_at_lsn(&*self.walredo_mgr, tag.blknum, lsn) + if let Some((layer, lsn)) = self.get_layer_for_read(rel, lsn)? { + layer.get_page_at_lsn(&*self.walredo_mgr, blknum, lsn) } else { - bail!("relation {} not found at {}", tag.rel, lsn); + bail!("relish {} not found at {}", rel, lsn); } } - fn get_rel_size(&self, rel: RelTag, lsn: Lsn) -> Result { + fn get_page_at_lsn_nowait(&self, rel: RelishTag, blknum: u32, lsn: Lsn) -> Result { + if !rel.is_blocky() && blknum != 0 { + bail!( + "invalid request for block {} for non-blocky relish {}", + blknum, + rel + ); + } + + if let Some((layer, lsn)) = self.get_layer_for_read(rel, lsn)? { + layer.get_page_at_lsn(&*self.walredo_mgr, blknum, lsn) + } else { + bail!("relish {} not found at {}", rel, lsn); + } + } + + fn get_rel_size(&self, rel: RelishTag, lsn: Lsn) -> Result { + if !rel.is_blocky() { + bail!("invalid get_rel_size request for non-blocky relish {}", rel); + } + let lsn = self.wait_lsn(lsn)?; if let Some((layer, lsn)) = self.get_layer_for_read(rel, lsn)? { @@ -402,7 +445,7 @@ impl Timeline for LayeredTimeline { } } - fn get_rel_exists(&self, rel: RelTag, lsn: Lsn) -> Result { + fn get_rel_exists(&self, rel: RelishTag, lsn: Lsn) -> Result { let lsn = self.wait_lsn(lsn)?; let result; @@ -412,7 +455,7 @@ impl Timeline for LayeredTimeline { result = false; } - trace!("get_relsize_exists: {:?} at {} -> {:?}", rel, lsn, result); + trace!("get_relsize_exists: {} at {} -> {}", rel, lsn, result); Ok(result) } @@ -425,7 +468,7 @@ impl Timeline for LayeredTimeline { let mut all_rels = HashSet::new(); let mut timeline = self; loop { - let rels = SnapshotLayer::list_rels(self.conf, timeline.timelineid, spcnode, dbnode)?; + let rels = SnapshotLayer::list_rels(self.conf, timeline.timelineid, timeline.tenantid, spcnode, dbnode)?; // FIXME: We should filter out relations that don't exist at the given LSN. all_rels.extend(rels.iter()); @@ -441,12 +484,43 @@ impl Timeline for LayeredTimeline { Ok(all_rels) } + fn list_nonrels(&self, lsn: Lsn) -> Result> { + info!("list_nonrels called at {}", lsn); + + // List all rels in this timeline, and all its ancestors. + let mut all_rels = HashSet::new(); + let mut timeline = self; + loop { + // SnapshotFile::list_rels works by scanning the directory on disk. Make sure + // we have a file on disk for each relation. + // + // FIXME: I think checkpoint() assumes that there can be only one running + // at a time, and that no modifications are "put" to the timeline concurrently. + // this violates those assumptions. + timeline.checkpoint()?; + + let rels = SnapshotLayer::list_nonrels(self.conf, timeline.timelineid, timeline.tenantid, lsn)?; + + // FIXME: We should filter out relishes that don't exist at the given LSN. + all_rels.extend(rels.iter()); + + if let Some(ancestor) = timeline.ancestor_timeline.as_ref() { + timeline = ancestor; + continue; + } else { + break; + } + } + + Ok(all_rels) + } + fn history<'a>(&'a self) -> Result> { // TODO todo!(); } - fn gc_iteration(&self, horizon: u64) -> Result { + fn gc_iteration(&self, horizon: u64, _compact: bool) -> Result { // In the layered repository, event to GC a single timeline, // we have to scan all the timelines to determine what child // timelines there are, so that we know to retain snapshot @@ -460,37 +534,59 @@ impl Timeline for LayeredTimeline { // But do flush the in-memory layers to disk first. self.checkpoint()?; - LayeredRepository::gc_iteration(self.conf, horizon) + LayeredRepository::gc_iteration(self.conf, self.tenantid, horizon) } - fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) -> Result<()> { - debug!("put_wal_record: {:?} at {}", tag, rec.lsn); - - let layer = self.get_layer_for_write(tag.rel, rec.lsn)?; - layer.put_wal_record(tag.blknum, rec) + fn put_wal_record(&self, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> { + let layer = self.get_layer_for_write(rel, rec.lsn)?; + layer.put_wal_record(blknum, rec) } - fn put_truncation(&self, rel: RelTag, lsn: Lsn, relsize: u32) -> anyhow::Result<()> { - debug!("put_truncation: {:?} at {}", relsize, lsn); + fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: u32) -> anyhow::Result<()> { + if !rel.is_blocky() { + bail!("invalid truncation for non-blocky relish {}", rel); + } + + debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); let layer = self.get_layer_for_write(rel, lsn)?; layer.put_truncation(lsn, relsize) } - fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) -> Result<()> { - debug!("put_page_image: {:?} at {}", tag, lsn); - - let layer = self.get_layer_for_write(tag.rel, lsn)?; - layer.put_page_image(tag.blknum, lsn, img) + fn put_page_image( + &self, + rel: RelishTag, + blknum: u32, + lsn: Lsn, + img: Bytes, + _update_meta: bool, + ) -> Result<()> { + let layer = self.get_layer_for_write(rel, lsn)?; + layer.put_page_image(blknum, lsn, img) } - fn put_unlink(&self, rel: RelTag, lsn: Lsn) -> Result<()> { + fn put_unlink(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { debug!("put_unlink: {} at {}", rel, lsn); let layer = self.get_layer_for_write(rel, lsn)?; layer.put_unlink(lsn) } + fn put_raw_data(&self, _tag: crate::object_key::ObjectTag, _lsn: Lsn, _data: &[u8]) -> Result<()> { + + // FIXME: This doesn't make much sense for the layered storage format, + // it's pretty tightly coupled with the way the object store stores + // things. + bail!("put_raw_data not implemented"); + } + + fn get_next_tag( + &self, + _tag: crate::object_key::ObjectTag, + ) -> Result> { + todo!(); + } + /// /// Flush to disk all data that was written with the put_* functions /// @@ -525,7 +621,7 @@ impl Timeline for LayeredTimeline { let layers = std::mem::take(&mut *layers); for layer in layers.0.values() { if !layer.is_frozen() { - layer.freeze(last_valid_lsn + 1)?; + layer.freeze(last_valid_lsn)?; } } @@ -540,10 +636,11 @@ impl Timeline for LayeredTimeline { let metadata = TimelineMetadata { last_valid_lsn: self.last_valid_lsn.load(), last_record_lsn: self.last_record_lsn.load(), + prev_record_lsn: self.prev_record_lsn.load(), ancestor_timeline: ancestor_timelineid, ancestor_lsn: self.ancestor_lsn, }; - LayeredRepository::save_metadata(self.conf, self.timelineid, &metadata)?; + LayeredRepository::save_metadata(self.conf, self.timelineid, self.tenantid, &metadata)?; // If there were any concurrent updates on the timeline, we would have to work // harder to make sure we don't lose the new updates. Currently, that shouldn't @@ -572,6 +669,7 @@ impl Timeline for LayeredTimeline { assert!(old == Lsn(0)); let old = self.last_record_lsn.fetch_max(lsn); assert!(old == Lsn(0)); + self.prev_record_lsn.store(Lsn(0)); } fn get_last_valid_lsn(&self) -> Lsn { @@ -588,6 +686,9 @@ impl Timeline for LayeredTimeline { let old = self.last_record_lsn.fetch_max(lsn); assert!(old <= lsn); + // Use old value of last_record_lsn as prev_record_lsn + self.prev_record_lsn.fetch_max(old); + // Also advance last_valid_lsn let old = self.last_valid_lsn.advance(lsn); // Can't move backwards. @@ -602,6 +703,10 @@ impl Timeline for LayeredTimeline { fn get_last_record_lsn(&self) -> Lsn { self.last_record_lsn.load() } + + fn get_prev_record_lsn(&self) -> Lsn { + self.prev_record_lsn.load() + } } impl LayeredTimeline { @@ -613,17 +718,20 @@ impl LayeredTimeline { metadata: TimelineMetadata, ancestor: Option>, timelineid: ZTimelineId, + tenantid: ZTenantId, walredo_mgr: Arc, ) -> Result { let timeline = LayeredTimeline { conf, timelineid, + tenantid, layers: Mutex::new(LayerMap::default()), walredo_mgr, last_valid_lsn: SeqWait::new(metadata.last_valid_lsn), last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0), + prev_record_lsn: AtomicLsn::new(metadata.prev_record_lsn.0), ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn, @@ -639,7 +747,7 @@ impl LayeredTimeline { /// fn get_layer_for_read( &self, - tag: RelTag, + rel: RelishTag, lsn: Lsn, ) -> Result, Lsn)>> { // First dig the right ancestor timeline @@ -647,7 +755,7 @@ impl LayeredTimeline { let mut lsn = lsn; trace!( "get_layer_for_read called for {} at {}/{}", - tag, + rel, self.timelineid, lsn ); @@ -661,46 +769,82 @@ impl LayeredTimeline { } loop { - // Then look up the layer let mut layers = timeline.layers.lock().unwrap(); - - // FIXME: If there is an entry in memory for an older snapshot file, - // but there is a newere snapshot file on disk, this will incorrectly - // return the older entry from memory. // // FIXME: If the relation has been dropped, does this return the right // thing? The compute node should not normally request dropped relations, // but if OID wraparound happens the same relfilenode might get reused // for an unrelated relation. // - if let Some(layer) = layers.get(tag, lsn) { - trace!("found layer in memory: {}-{}", layer.get_start_lsn(), layer.get_end_lsn()); - return Ok(Some((layer.clone(), lsn))); - } else { - // No layer in memory for this relation yet. Read it from disk. - if let Some(layer) = - SnapshotLayer::load(timeline.conf, timeline.timelineid, tag, lsn)? - { - trace!( - "found snapshot file on disk: {}-{}", - layer.get_start_lsn(), - layer.get_end_lsn() - ); - let layer_rc: Arc = Arc::new(layer); - layers.insert(Arc::clone(&layer_rc)); + let mut best_candidate = None; + let mut best_end_lsn = Lsn(0); - return Ok(Some((layer_rc, lsn))); - } else { - // No snapshot files for this relation on this timeline. But there might still - // be one on the ancestor timeline - if let Some(ancestor) = &timeline.ancestor_timeline { - lsn = timeline.ancestor_lsn; - timeline = &ancestor.as_ref(); - trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); - continue; - } - return Ok(None); + // First, see if we have loaded a layer in the cache ready. + if let Some(layer) = layers.get(rel, lsn) { + trace!( + "found layer in cache: {} {}-{}", + timeline.timelineid, + layer.get_start_lsn(), + layer.get_end_lsn() + ); + + assert!(layer.get_start_lsn() <= lsn); + + // If this layer's LSN range contains the request LSN, it is an "exact" match, + // and we can return it directly. If it's not an exact match there might be + // a more recent layer on disk than what we have in cache. + // + // For example, imagine that the following snapshot files exist: + // + // 100-200 [cached] + // 200-300 + // 300-400 + // + // A request comes in for LSN 250. We already have the layer 100-200 in cache, + // so we find it here. But there's a newer layer on disk for 200-300, that's the + // correct one we need to return from this function. If the 200-300 snapshot file + // didn't exist (because there were no modifications to the relation after LSN + // 200), then the 100-200 layer was the correct one + // + // So if we find a layer in cache with end-LSN before the request LSN, remember + // that, but fall through to check if there is a newer snapshot file on disk before + // returning it. + if layer.get_end_lsn() >= lsn { + return Ok(Some((layer.clone(), lsn))); } + best_candidate = Some(layer.clone()); + best_end_lsn = layer.get_end_lsn(); + } + + // Proceed to check if there is a (better) snapshot file on disk. + if let Some(layer) = + SnapshotLayer::load(timeline.conf, timeline.timelineid, timeline.tenantid, rel, best_end_lsn, lsn)? + { + trace!( + "found snapshot file on disk: {}-{}", + layer.get_start_lsn(), + layer.get_end_lsn() + ); + let layer_rc: Arc = Arc::new(layer); + layers.insert(Arc::clone(&layer_rc)); + + return Ok(Some((layer_rc, lsn))); + } else { + // No (better) snapshot files for this relation on this timeline. If we found + // something in cache, return that. + if let Some(result) = best_candidate { + return Ok(Some((result, lsn))); + } + + // If we got nothing on this timeline, check if there's a layer on the ancestor + // timeline + if let Some(ancestor) = &timeline.ancestor_timeline { + lsn = timeline.ancestor_lsn; + timeline = &ancestor.as_ref(); + trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); + continue; + } + return Ok(None); } } } @@ -708,14 +852,14 @@ impl LayeredTimeline { /// /// Get a handle to the latest layer for appending. /// - fn get_layer_for_write(&self, tag: RelTag, lsn: Lsn) -> Result> { + fn get_layer_for_write(&self, rel: RelishTag, lsn: Lsn) -> Result> { if lsn < self.last_valid_lsn.load() { bail!("cannot modify relation after advancing last_valid_lsn"); } // Look up the snapshot file let layers = self.layers.lock().unwrap(); - if let Some(layer) = layers.get(tag, lsn) { + if let Some(layer) = layers.get(rel, lsn) { if !layer.is_frozen() { return Ok(Arc::clone(&layer)); } @@ -733,7 +877,7 @@ impl LayeredTimeline { drop(layers); let layer; - if let Some((prev_layer, _prev_lsn)) = self.get_layer_for_read(tag, lsn)? { + if let Some((prev_layer, _prev_lsn)) = self.get_layer_for_read(rel, lsn)? { // Create new entry after the previous one. let lsn; if prev_layer.get_timeline_id() != self.timelineid { @@ -741,7 +885,7 @@ impl LayeredTimeline { lsn = self.ancestor_lsn; trace!( "creating file for write for {} at branch point {}/{}", - tag, + rel, self.timelineid, lsn ); @@ -749,7 +893,7 @@ impl LayeredTimeline { lsn = prev_layer.get_end_lsn(); trace!( "creating file for write for {} after previous layer {}/{}", - tag, + rel, self.timelineid, lsn ); @@ -765,13 +909,14 @@ impl LayeredTimeline { &*self.walredo_mgr, &*prev_layer, self.timelineid, + self.tenantid, lsn, )?; } else { // New relation. trace!( "creating layer for write for new rel {} at {}/{}", - tag, + rel, self.timelineid, lsn ); @@ -782,7 +927,9 @@ impl LayeredTimeline { if let Some((_start, end, dropped)) = SnapshotLayer::find_latest_snapshot_file( self.conf, self.timelineid, - tag, + self.tenantid, + rel, + Lsn(0), Lsn(u64::MAX), )? { if dropped { @@ -793,7 +940,7 @@ impl LayeredTimeline { } else { start_lsn = lsn; } - layer = InMemoryLayer::create(self.conf, self.timelineid, tag, start_lsn)?; + layer = InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, rel, start_lsn)?; } let mut layers = self.layers.lock().unwrap(); diff --git a/pageserver/src/layered_repository/README.md b/pageserver/src/layered_repository/README.md index 404c925eb1..457c05e934 100644 --- a/pageserver/src/layered_repository/README.md +++ b/pageserver/src/layered_repository/README.md @@ -12,11 +12,11 @@ are stored in the timeline's subdirectory under .zenith/timelines. The files are named like this: - _____ + rel______ For example: - 1663_13990_2609_0_000000000169C348_0000000001702000 + rel_1663_13990_2609_0_000000000169C348_0000000001702000 Each snapshot file contains a full snapshot, that is, full copy of all pages in the relation, as of the "start LSN". It also contains all WAL @@ -27,13 +27,20 @@ version of the relation in the LSN range. If a file has been dropped, the last snapshot file for it is created with the _DROPPED suffix, e.g. - 1663_13990_2609_0_000000000169C348_0000000001702000_DROPPED + rel_1663_13990_2609_0_000000000169C348_0000000001702000_DROPPED + +In addition to the relations, with "rel_*" prefix, we use the same +format for storing various smaller files from the PostgreSQL data +directory. They will use different suffixes and the naming scheme +up to the LSN range varies. The Zenith source code uses the term +"relish" to mean "a relation, or other file that's treated like a +relation in the storage" ## Notation used in this document The full path of a snapshot file looks like this: - .zenith/timelines/4af489b06af8eed9e27a841775616962/1663_13990_2609_0_000000000169C348_0000000001702000 + .zenith/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_000000000169C348_0000000001702000 For simplicity, the examples below use a simplified notation for the paths. The timeline ID is replaced with the human-readable branch diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index e74fb99df1..e0672b46fa 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -6,10 +6,11 @@ use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::PageVersion; use crate::layered_repository::SnapshotLayer; -use crate::repository::{RelTag, WALRecord}; +use crate::relish::*; +use crate::repository::WALRecord; use crate::walredo::WalRedoManager; use crate::PageServerConf; -use crate::ZTimelineId; +use crate::{ZTenantId, ZTimelineId}; use anyhow::{bail, Result}; use bytes::Bytes; use log::*; @@ -23,8 +24,9 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); pub struct InMemoryLayer { conf: &'static PageServerConf, + tenantid: ZTenantId, timelineid: ZTimelineId, - tag: RelTag, + rel: RelishTag, /// /// This layer contains all the changes from 'start_lsn'. The @@ -35,7 +37,6 @@ pub struct InMemoryLayer { // FIXME: the three mutex-protected fields below should probably be protected // by a single mutex. - /// If this relation was dropped, remember when that happened. Lsn(0) means /// it hasn't been dropped drop_lsn: Mutex, @@ -61,8 +62,8 @@ impl Layer for InMemoryLayer { return self.timelineid; } - fn get_tag(&self) -> RelTag { - return self.tag; + fn get_relish_tag(&self) -> RelishTag { + return self.rel; } fn get_start_lsn(&self) -> Lsn { @@ -120,12 +121,12 @@ impl Layer for InMemoryLayer { // but never writes the page. // // Would be nice to detect that situation better. - warn!("Page {:?}/{} at {} not found", self.tag, blknum, lsn); + warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn); return Ok(ZERO_PAGE.clone()); } bail!( "No base image found for page {} blk {} at {}/{}", - self.tag, + self.rel, blknum, self.timelineid, lsn @@ -138,14 +139,14 @@ impl Layer for InMemoryLayer { trace!( "found page image for blk {} in {} at {}/{}, no WAL redo required", blknum, - self.tag, + self.rel, self.timelineid, lsn ); Ok(img) } else { // FIXME: this ought to be an error? - warn!("Page {:?}/{} at {} not found", self.tag, blknum, lsn); + warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn); Ok(ZERO_PAGE.clone()) } } else { @@ -156,8 +157,8 @@ impl Layer for InMemoryLayer { if page_img.is_none() && !records.first().unwrap().will_init { // FIXME: this ought to be an error? warn!( - "Base image for page {:?}/{} at {} not found, but got {} WAL records", - self.tag, + "Base image for page {}/{} at {} not found, but got {} WAL records", + self.rel, blknum, lsn, records.len() @@ -165,17 +166,11 @@ impl Layer for InMemoryLayer { Ok(ZERO_PAGE.clone()) } else { if page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.tag, self.timelineid, lsn); + trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn); } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.tag, self.timelineid, lsn); + trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn); } - let img = walredo_mgr.request_redo( - self.rel, - blknum, - lsn, - page_img, - records, - )?; + let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?; self.put_page_image(blknum, lsn, img.clone())?; @@ -191,12 +186,14 @@ impl Layer for InMemoryLayer { let mut iter = relsizes.range((Included(&Lsn(0)), Included(&lsn))); if let Some((_entry_lsn, entry)) = iter.next_back() { - trace!("get_relsize: {} at {} -> {}", self.tag, lsn, *entry); - Ok(*entry) + let result = *entry; + drop(relsizes); + trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result); + Ok(result) } else { bail!( "No size found for relfile {:?} at {} in memory", - self.tag, + self.rel, lsn ); } @@ -225,7 +222,7 @@ impl Layer for InMemoryLayer { trace!( "put_page_version blk {} of {} at {}/{}", blknum, - self.tag, + self.rel, self.timelineid, lsn ); @@ -237,7 +234,7 @@ impl Layer for InMemoryLayer { // We already had an entry for this LSN. That's odd.. warn!( "Page version of rel {:?} blk {} at {} already exists", - self.tag, blknum, lsn + self.rel, blknum, lsn ); } @@ -259,7 +256,7 @@ impl Layer for InMemoryLayer { if blknum >= oldsize { trace!( "enlarging relation {} from {} to {} blocks", - self.tag, + self.rel, oldsize, blknum + 1 ); @@ -285,13 +282,12 @@ impl Layer for InMemoryLayer { /// Remember that the relation was truncated at given LSN fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> { - let mut drop_lsn = self.drop_lsn.lock().unwrap(); assert!(*drop_lsn == Lsn(0)); *drop_lsn = lsn; - info!("dropped relation {} at {}", self.tag, lsn); + info!("dropped relation {} at {}", self.rel, lsn); Ok(()) } @@ -300,6 +296,11 @@ impl Layer for InMemoryLayer { /// Write the this in-memory layer to disk, as a snapshot layer. /// fn freeze(&self, end_lsn: Lsn) -> Result<()> { + info!( + "freezing in memory layer for {} on timeline {} at {}", + self.rel, self.timelineid, end_lsn + ); + let page_versions = self.page_versions.lock().unwrap(); let relsizes = self.relsizes.lock().unwrap(); let drop_lsn = self.drop_lsn.lock().unwrap(); @@ -312,18 +313,18 @@ impl Layer for InMemoryLayer { let dropped = *drop_lsn != Lsn(0); - let end_lsn = - if dropped { - assert!(*drop_lsn < end_lsn); - *drop_lsn - } else { - end_lsn - }; + let end_lsn = if dropped { + assert!(*drop_lsn < end_lsn); + *drop_lsn + } else { + end_lsn + }; let _snapfile = SnapshotLayer::create( self.conf, self.timelineid, - self.tag, + self.tenantid, + self.rel, self.start_lsn, end_lsn, dropped, @@ -342,18 +343,22 @@ impl InMemoryLayer { pub fn create( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, + tenantid: ZTenantId, + rel: RelishTag, start_lsn: Lsn, ) -> Result { - debug!( - "initializing new InMemoryLayer for writing {} on timeline {}", - tag, timelineid + trace!( + "initializing new InMemoryLayer for writing {} on timeline {} at {}", + rel, + timelineid, + start_lsn ); Ok(InMemoryLayer { conf, timelineid, - tag, + tenantid, + rel, start_lsn, drop_lsn: Mutex::new(Lsn(0)), page_versions: Mutex::new(BTreeMap::new()), @@ -370,12 +375,14 @@ impl InMemoryLayer { walredo_mgr: &dyn WalRedoManager, src: &dyn Layer, timelineid: ZTimelineId, + tenantid: ZTenantId, lsn: Lsn, ) -> Result { - debug!( - "initializing new InMemoryLayer for writing {} on timeline {}", - src.get_tag(), - timelineid + trace!( + "initializing new InMemoryLayer for writing {} on timeline {} at {}", + src.get_relish_tag(), + timelineid, + lsn ); let mut page_versions = BTreeMap::new(); let mut relsizes = BTreeMap::new(); @@ -395,11 +402,39 @@ impl InMemoryLayer { Ok(InMemoryLayer { conf, timelineid, - tag: src.get_tag(), + tenantid, + rel: src.get_relish_tag(), start_lsn: lsn, drop_lsn: Mutex::new(Lsn(0)), page_versions: Mutex::new(page_versions), relsizes: Mutex::new(relsizes), }) } + + /// debugging function to print out the contents of the layer + #[allow(unused)] + pub fn dump(&self) -> String { + let mut result = format!( + "----- inmemory layer for {} {}-> ----\n", + self.rel, self.start_lsn + ); + + let relsizes = self.relsizes.lock().unwrap(); + let page_versions = self.page_versions.lock().unwrap(); + + for (k, v) in relsizes.iter() { + result += &format!("{}: {}\n", k, v); + } + for (k, v) in page_versions.iter() { + result += &format!( + "blk {} at {}: {}/{}\n", + k.0, + k.1, + v.page_image.is_some(), + v.record.is_some() + ); + } + + result + } } diff --git a/pageserver/src/layered_repository/snapshot_layer.rs b/pageserver/src/layered_repository/snapshot_layer.rs index 7906bb991f..dd45d8399d 100644 --- a/pageserver/src/layered_repository/snapshot_layer.rs +++ b/pageserver/src/layered_repository/snapshot_layer.rs @@ -10,7 +10,7 @@ //! a new snapshot file is created for writing, the full contents of relation are //! materialized as it is at the beginning of the LSN range. That can be very expensive, //! we should find a way to store differential files. But this keeps the read-side -//! of things simple. You can find the correct snapshot file based on RelTag and +//! of things simple. You can find the correct snapshot file based on RelishTag and //! timeline+LSN, and once you've located it, you have all the data you need to in that //! file. //! @@ -39,18 +39,21 @@ //! use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::storage_layer::PageVersion; -use crate::repository::{GcResult, RelTag, WALRecord}; +use crate::layered_repository::storage_layer::ZERO_PAGE; +use crate::relish::*; +use crate::repository::{GcResult, WALRecord}; use crate::walredo::WalRedoManager; use crate::PageServerConf; -use crate::ZTimelineId; +use crate::{ZTimelineId, ZTenantId}; use anyhow::{anyhow, bail, Result}; use bytes::Bytes; use log::*; -use std::collections::{BTreeMap, HashSet, BTreeSet}; +use std::collections::{BTreeMap, BTreeSet, HashSet}; +use std::fmt; use std::fs; use std::fs::File; use std::io::Write; -use std::ops::Bound::Included; +use std::ops::Bound::{Excluded, Included, Unbounded}; use std::path::PathBuf; use std::sync::Mutex; use std::time::Instant; @@ -60,14 +63,146 @@ use bookfile::{Book, BookWriter}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - // Magic constant to identify a Zenith snapshot file static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01; static PAGE_VERSIONS_CHAPTER: u64 = 1; static REL_SIZES_CHAPTER: u64 = 2; +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +struct SnapshotFileName { + rel: RelishTag, + start_lsn: Lsn, + end_lsn: Lsn, + dropped: bool, +} + +impl SnapshotFileName { + fn from_str(fname: &str) -> Option { + // Split the filename into parts + // + // _____ + // + // or if it was dropped: + // + // ______DROPPED + // + let rel; + let mut parts; + if let Some(rest) = fname.strip_prefix("rel_") { + parts = rest.split('_'); + rel = RelishTag::Relation(RelTag { + spcnode: parts.next()?.parse::().ok()?, + dbnode: parts.next()?.parse::().ok()?, + relnode: parts.next()?.parse::().ok()?, + forknum: parts.next()?.parse::().ok()?, + }); + } else if let Some(rest) = fname.strip_prefix("pg_xact_") { + parts = rest.split('_'); + rel = RelishTag::Slru { + slru: SlruKind::Clog, + segno: u32::from_str_radix(parts.next()?, 16).ok()?, + }; + } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { + parts = rest.split('_'); + rel = RelishTag::Slru { + slru: SlruKind::MultiXactMembers, + segno: u32::from_str_radix(parts.next()?, 16).ok()?, + }; + } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { + parts = rest.split('_'); + rel = RelishTag::Slru { + slru: SlruKind::MultiXactOffsets, + segno: u32::from_str_radix(parts.next()?, 16).ok()?, + }; + } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { + parts = rest.split('_'); + rel = RelishTag::FileNodeMap { + spcnode: parts.next()?.parse::().ok()?, + dbnode: parts.next()?.parse::().ok()?, + }; + } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { + parts = rest.split('_'); + rel = RelishTag::TwoPhase { + xid: parts.next()?.parse::().ok()?, + }; + } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { + parts = rest.split('_'); + rel = RelishTag::Checkpoint; + } else if let Some(rest) = fname.strip_prefix("pg_control_") { + parts = rest.split('_'); + rel = RelishTag::ControlFile; + } else { + return None; + } + + let start_lsn = Lsn::from_hex(parts.next()?).ok()?; + let end_lsn = Lsn::from_hex(parts.next()?).ok()?; + + let mut dropped = false; + if let Some(suffix) = parts.next() { + if suffix == "DROPPED" { + dropped = true; + } else { + warn!("unrecognized filename in timeline dir: {}", fname); + return None; + } + } + if parts.next().is_some() { + warn!("unrecognized filename in timeline dir: {}", fname); + return None; + } + + Some(SnapshotFileName { + rel, + start_lsn, + end_lsn, + dropped, + }) + } + + fn to_string(&self) -> String { + let basename = match self.rel { + RelishTag::Relation(reltag) => format!( + "rel_{}_{}_{}_{}", + reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum + ), + RelishTag::Slru { + slru: SlruKind::Clog, + segno, + } => format!("pg_xact_{:04X}", segno), + RelishTag::Slru { + slru: SlruKind::MultiXactMembers, + segno, + } => format!("pg_multixact_members_{:04X}", segno), + RelishTag::Slru { + slru: SlruKind::MultiXactOffsets, + segno, + } => format!("pg_multixact_offsets_{:04X}", segno), + RelishTag::FileNodeMap { spcnode, dbnode } => { + format!("pg_filenodemap_{}_{}", spcnode, dbnode) + } + RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), + RelishTag::Checkpoint => format!("pg_control_checkpoint"), + RelishTag::ControlFile => format!("pg_control"), + }; + + format!( + "{}_{:016X}_{:016X}{}", + basename, + u64::from(self.start_lsn), + u64::from(self.end_lsn), + if self.dropped { "_DROPPED" } else { "" } + ) + } +} + +impl fmt::Display for SnapshotFileName { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_string()) + } +} + /// /// SnapshotLayer is the in-memory data structure associated with an on-disk snapshot file. /// It is also used to accumulate new changes at the tip of a branch; end_lsn is u64::MAX @@ -75,8 +210,9 @@ static REL_SIZES_CHAPTER: u64 = 2; /// pub struct SnapshotLayer { conf: &'static PageServerConf, + pub tenantid: ZTenantId, pub timelineid: ZTimelineId, - pub tag: RelTag, + pub rel: RelishTag, // // This entry contains all the changes from 'start_lsn' to 'end_lsn'. The @@ -107,8 +243,8 @@ impl Layer for SnapshotLayer { return self.timelineid; } - fn get_tag(&self) -> RelTag { - return self.tag; + fn get_relish_tag(&self) -> RelishTag { + return self.rel; } fn get_start_lsn(&self) -> Lsn { @@ -166,12 +302,12 @@ impl Layer for SnapshotLayer { // but never writes the page. // // Would be nice to detect that situation better. - warn!("Page {:?}/{} at {} not found", self.tag, blknum, lsn); + warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn); return Ok(ZERO_PAGE.clone()); } bail!( "No base image found for page {} blk {} at {}/{}", - self.tag, + self.rel, blknum, self.timelineid, lsn @@ -184,14 +320,14 @@ impl Layer for SnapshotLayer { trace!( "found page image for blk {} in {} at {}/{}, no WAL redo required", blknum, - self.tag, + self.rel, self.timelineid, lsn ); Ok(img) } else { // FIXME: this ought to be an error? - warn!("Page {:?}/{} at {} not found", self.tag, blknum, lsn); + warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn); Ok(ZERO_PAGE.clone()) } } else { @@ -202,8 +338,8 @@ impl Layer for SnapshotLayer { if page_img.is_none() && !records.first().unwrap().will_init { // FIXME: this ought to be an error? warn!( - "Base image for page {:?}/{} at {} not found, but got {} WAL records", - self.tag, + "Base image for page {} blk {} at {} not found, but got {} WAL records", + self.rel, blknum, lsn, records.len() @@ -211,17 +347,11 @@ impl Layer for SnapshotLayer { Ok(ZERO_PAGE.clone()) } else { if page_img.is_some() { - trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.tag, self.timelineid, lsn); + trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn); } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.tag, self.timelineid, lsn); + trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn); } - let img = walredo_mgr.request_redo( - self.tag, - blknum, - lsn, - page_img, - records, - )?; + let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?; // FIXME: Should we memoize the page image in memory, so that // we wouldn't need to reconstruct it again, if it's requested again? @@ -239,14 +369,12 @@ impl Layer for SnapshotLayer { let mut iter = relsizes.range((Included(&Lsn(0)), Included(&lsn))); if let Some((_entry_lsn, entry)) = iter.next_back() { - trace!("get_relsize: {} at {} -> {}", self.tag, lsn, *entry); - Ok(*entry) + let result = *entry; + drop(relsizes); + trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result); + Ok(result) } else { - bail!( - "No size found for relfile {:?} at {} in memory", - self.tag, - lsn - ); + bail!("No size found for {} at {} in memory", self.rel, lsn); } } @@ -269,7 +397,7 @@ impl Layer for SnapshotLayer { fn put_page_version(&self, blknum: u32, lsn: Lsn, _pv: PageVersion) -> Result<()> { panic!( "cannot modify historical snapshot file, rel {} blk {} at {}/{}, {}-{}", - self.tag, blknum, self.timelineid, lsn, self.start_lsn, self.end_lsn + self.rel, blknum, self.timelineid, lsn, self.start_lsn, self.end_lsn ); } fn put_truncation(&self, _lsn: Lsn, _relsize: u32) -> anyhow::Result<()> { @@ -290,33 +418,23 @@ impl SnapshotLayer { Self::path_for( self.conf, self.timelineid, - self.tag, - self.start_lsn, - self.end_lsn, - self.dropped, + self.tenantid, + &SnapshotFileName { + rel: self.rel, + start_lsn: self.start_lsn, + end_lsn: self.end_lsn, + dropped: self.dropped, + }, ) } fn path_for( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool + tenantid: ZTenantId, + fname: &SnapshotFileName, ) -> PathBuf { - let fname = format!( - "{}_{}_{}_{}_{:016X}_{:016X}{}", - tag.spcnode, - tag.dbnode, - tag.relnode, - tag.forknum, - u64::from(start_lsn), - u64::from(end_lsn), - if dropped { "_DROPPED" } else { "" }, - ); - - conf.timeline_path(timelineid).join(&fname) + conf.timeline_path(&timelineid, &tenantid).join(fname.to_string()) } /// Create a new snapshot file, using the given btreemaps containing the page versions and @@ -328,7 +446,8 @@ impl SnapshotLayer { pub fn create( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, + tenantid: ZTenantId, + rel: RelishTag, start_lsn: Lsn, end_lsn: Lsn, dropped: bool, @@ -338,7 +457,8 @@ impl SnapshotLayer { let snapfile = SnapshotLayer { conf: conf, timelineid: timelineid, - tag: tag, + tenantid: tenantid, + rel: rel, start_lsn: start_lsn, end_lsn, dropped, @@ -377,7 +497,7 @@ impl SnapshotLayer { book.close()?; - debug!("saved {}", &path.display()); + trace!("saved {}", &path.display()); Ok(()) } @@ -388,26 +508,24 @@ impl SnapshotLayer { pub fn find_latest_snapshot_file( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, + tenantid: ZTenantId, + rel: RelishTag, + earliest_lsn: Lsn, lsn: Lsn, ) -> Result> { // Scan the timeline directory to get all rels in this timeline. - let path = conf.timeline_path(timelineid); let mut result_start_lsn = Lsn(0); let mut result_end_lsn = Lsn(0); let mut result_dropped = false; - for direntry in fs::read_dir(path)? { - let direntry = direntry?; + for fname in Self::list_snapshot_files(conf, timelineid, tenantid)? { + if fname.end_lsn <= earliest_lsn { + continue; + } - let fname = direntry.file_name(); - let fname = fname.to_str().unwrap(); - - if let Some((reltag, start_lsn, end_lsn, dropped)) = Self::fname_to_tag(fname) { - if reltag == tag && start_lsn <= lsn && start_lsn > result_start_lsn { - result_start_lsn = start_lsn; - result_end_lsn = end_lsn; - result_dropped = dropped; - } + if fname.rel == rel && fname.start_lsn <= lsn && fname.end_lsn > result_end_lsn { + result_start_lsn = fname.start_lsn; + result_end_lsn = fname.end_lsn; + result_dropped = fname.dropped; } } if result_start_lsn != Lsn(0) { @@ -420,18 +538,20 @@ impl SnapshotLayer { /// /// Load the state for one relation back into memory. /// - /// Returns the latest snapshot file that before the given 'lsn'. + /// Returns the latest snapshot file that before the given 'lsn', but newer than 'earliest_lsn' /// pub fn load( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, + tenantid: ZTenantId, + rel: RelishTag, + earliest_lsn: Lsn, lsn: Lsn, ) -> Result> { if let Some((start_lsn, end_lsn, dropped)) = - Self::find_latest_snapshot_file(conf, timelineid, tag, lsn)? + Self::find_latest_snapshot_file(conf, timelineid, tenantid, rel, earliest_lsn, lsn)? { - let snap = Self::load_path(conf, timelineid, tag, start_lsn, end_lsn, dropped)?; + let snap = Self::load_path(conf, timelineid, tenantid, rel, start_lsn, end_lsn, dropped)?; Ok(Some(snap)) } else { Ok(None) @@ -441,12 +561,23 @@ impl SnapshotLayer { fn load_path( conf: &'static PageServerConf, timelineid: ZTimelineId, - tag: RelTag, + tenantid: ZTenantId, + rel: RelishTag, start_lsn: Lsn, end_lsn: Lsn, dropped: bool, ) -> Result { - let path = Self::path_for(conf, timelineid, tag, start_lsn, end_lsn, dropped); + let path = Self::path_for( + conf, + timelineid, + tenantid, + &SnapshotFileName { + rel, + start_lsn, + end_lsn, + dropped, + }, + ); let file = File::open(&path)?; let mut book = Book::new(file)?; @@ -468,7 +599,8 @@ impl SnapshotLayer { Ok(SnapshotLayer { conf, timelineid, - tag, + tenantid, + rel, start_lsn, end_lsn, dropped, @@ -480,21 +612,15 @@ impl SnapshotLayer { pub fn list_rels( conf: &'static PageServerConf, timelineid: ZTimelineId, + tenantid: ZTenantId, spcnode: u32, dbnode: u32, ) -> Result> { let mut rels: HashSet = HashSet::new(); // Scan the timeline directory to get all rels in this timeline. - let path = conf.timeline_path(timelineid); - for direntry in fs::read_dir(path)? { - let direntry = direntry?; - - let fname = direntry.file_name(); - let fname = fname.to_str().unwrap(); - - if let Some((reltag, _start_lsn, _end_lsn, _dropped)) = Self::fname_to_tag(fname) { - + for snapfiles in Self::list_snapshot_files(conf, timelineid, tenantid)? { + if let RelishTag::Relation(reltag) = snapfiles.rel { // FIXME: skip if it was dropped before the requested LSN. But there is no // LSN argument @@ -508,44 +634,26 @@ impl SnapshotLayer { Ok(rels) } - fn fname_to_tag(fname: &str) -> Option<(RelTag, Lsn, Lsn, bool)> { - // Split the filename into parts - // - // _____ - // - // or if it was dropped: - // - // ______DROPPED - // - let mut parts = fname.split('_'); + pub fn list_nonrels( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + _lsn: Lsn, + ) -> Result> { + let mut rels: HashSet = HashSet::new(); - let reltag = RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }; - let start_lsn = Lsn::from_hex(parts.next()?).ok()?; - let end_lsn = Lsn::from_hex(parts.next()?).ok()?; + // Scan the timeline directory to get all rels in this timeline. + for snapfile in Self::list_snapshot_files(conf, timelineid, tenantid)? { + // FIXME: skip if it was dropped before the requested LSN. - let mut dropped = false; - if let Some(suffix) = parts.next() { - if suffix == "DROPPED" { - dropped = true; + if let RelishTag::Relation(_) = snapfile.rel { } else { - warn!("unrecognized filename in timeline dir: {}", fname); - return None; + rels.insert(snapfile.rel); } } - if parts.next().is_some() { - warn!("unrecognized filename in timeline dir: {}", fname); - return None; - } - - Some((reltag, start_lsn, end_lsn, dropped)) + Ok(rels) } - /// /// Garbage collect snapshot files on a timeline that are no longer needed. /// @@ -568,72 +676,143 @@ impl SnapshotLayer { /// within a snapshot file. We can only remove the whole file if it's fully /// obsolete. /// - pub fn gc_timeline(conf: &'static PageServerConf, - timelineid: ZTimelineId, - retain_lsns: Vec, - cutoff: Lsn) -> Result { - + pub fn gc_timeline( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + retain_lsns: Vec, + cutoff: Lsn, + ) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); // Scan all snapshot files in the directory. For each file, if a newer file // exists, we can remove the old one. - // For convenience and speed, slurp the list of files in the directoy into memory first. - let mut snapfiles: BTreeSet<(RelTag, Lsn, Lsn, bool)> = BTreeSet::new(); + // For convenience and speed, slurp the list of files in the directory into memory first. + let mut snapfiles: BTreeSet = BTreeSet::new(); - let timeline_path = conf.timeline_path(timelineid); - for direntry in fs::read_dir(timeline_path)? { - let direntry = direntry?; - let fname = direntry.file_name(); - let fname = fname.to_str().unwrap(); + for fname in Self::list_snapshot_files(conf, timelineid, tenantid)? { + snapfiles.insert(fname.clone()); - if let Some((reltag, start_lsn, end_lsn, dropped)) = Self::fname_to_tag(fname) { - snapfiles.insert((reltag, start_lsn, end_lsn, dropped)); + if fname.rel.is_relation() { + result.snapshot_relfiles_total += 1; + } else { + result.snapshot_nonrelfiles_total += 1; } - result.snapshot_files_total += 1; } // Now determine for each file if it needs to be retained - 'outer: for (reltag, start_lsn, end_lsn, dropped) in &snapfiles { - + 'outer: for snapfile in &snapfiles { // Is it newer than cutoff point? - if *end_lsn > cutoff { - result.snapshot_files_needed_by_cutoff += 1; + if snapfile.end_lsn > cutoff { + if snapfile.rel.is_relation() { + result.snapshot_relfiles_needed_by_cutoff += 1; + } else { + result.snapshot_nonrelfiles_needed_by_cutoff += 1; + } continue 'outer; } // Is it needed by a child branch? for retain_lsn in &retain_lsns { // FIXME: are the bounds inclusive or exclusive? - if *start_lsn <= *retain_lsn && *retain_lsn <= *end_lsn { - result.snapshot_files_needed_by_branches += 1; + if snapfile.start_lsn <= *retain_lsn && *retain_lsn <= snapfile.end_lsn { + if snapfile.rel.is_relation() { + result.snapshot_relfiles_needed_by_branches += 1; + } else { + result.snapshot_nonrelfiles_needed_by_branches += 1; + } continue 'outer; } } // Unless the relation was dropped, is there a later snapshot file for this relation? - if !dropped && snapfiles.range( - (Included((*reltag, *end_lsn, Lsn(0), false)), - Included((*reltag, Lsn(u64::MAX), Lsn(0), true)))).next().is_none() { - // there is no later file, so keep it - result.snapshot_files_not_updated += 1; - continue 'outer; + if !snapfile.dropped { + let mut found_later_file = false; + if let Some(other_snapfile) = + snapfiles.range((Excluded(snapfile), Unbounded)).next() + { + if other_snapfile.rel != snapfile.rel { + // walked past the files for this rel. So there is no later file. + } else { + // found a later file. + found_later_file = true; + } + } + + if !found_later_file { + if snapfile.rel.is_relation() { + result.snapshot_relfiles_not_updated += 1; + } else { + result.snapshot_nonrelfiles_not_updated += 1; + } + continue 'outer; + } } // We didn't find any reason to keep this file, so remove it. - let path = Self::path_for(conf, timelineid, *reltag, *start_lsn, *end_lsn, *dropped); + let path = Self::path_for(conf, timelineid, tenantid, snapfile); info!("garbage collecting {}", path.display()); fs::remove_file(path)?; - if *dropped { - result.snapshot_files_dropped += 1; + if snapfile.dropped { + if snapfile.rel.is_relation() { + result.snapshot_relfiles_dropped += 1; + } else { + result.snapshot_nonrelfiles_dropped += 1; + } } else { - result.snapshot_files_removed += 1; + if snapfile.rel.is_relation() { + result.snapshot_relfiles_removed += 1; + } else { + result.snapshot_nonrelfiles_removed += 1; + } } } result.elapsed = now.elapsed(); Ok(result) } + + // TODO: returning an Iterator would be more idiomatic + fn list_snapshot_files( + conf: &'static PageServerConf, + timelineid: ZTimelineId, + tenantid: ZTenantId, + ) -> Result> { + let path = conf.timeline_path(&timelineid, &tenantid); + + let mut snapfiles = Vec::new(); + for direntry in fs::read_dir(path)? { + let fname = direntry?.file_name(); + let fname = fname.to_str().unwrap(); + + if let Some(snapfilename) = SnapshotFileName::from_str(fname) { + snapfiles.push(snapfilename); + } + } + return Ok(snapfiles); + } + + /// debugging function to print out the contents of the layer + #[allow(unused)] + pub fn dump(&self) -> String { + let mut result = format!( + "----- snapshot layer for {} {}-{} ----\n", + self.rel, self.start_lsn, self.end_lsn + ); + + let relsizes = self.relsizes.lock().unwrap(); + //let page_versions = self.page_versions.lock().unwrap(); + + for (k, v) in relsizes.iter() { + result += &format!("{}: {}\n", k, v); + } + //for (k, v) in page_versions.iter() { + // result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some()); + //} + + result + } } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index fa96296d09..3c15b131a8 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -1,4 +1,4 @@ -use crate::repository::RelTag; +use crate::relish::RelishTag; use crate::repository::WALRecord; use crate::walredo::WalRedoManager; use crate::ZTimelineId; @@ -8,6 +8,8 @@ use serde::{Deserialize, Serialize}; use zenith_utils::lsn::Lsn; +pub static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); + /// /// Represents a version of a page at a specific LSN. The LSN is the key of the /// entry in the 'page_versions' hash, it is not duplicated here. @@ -27,7 +29,7 @@ pub trait Layer: Send + Sync { fn is_frozen(&self) -> bool; fn get_timeline_id(&self) -> ZTimelineId; - fn get_tag(&self) -> RelTag; + fn get_relish_tag(&self) -> RelishTag; fn get_start_lsn(&self) -> Lsn; fn get_end_lsn(&self) -> Lsn; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index 84c0b40b52..ecc756a4b0 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -28,16 +28,18 @@ pub fn init(conf: &'static PageServerConf) { for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() { let tenantid = ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap(); - let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap(); // Set up a WAL redo manager, for applying WAL records. let walredo_mgr = PostgresRedoManager::new(conf, tenantid); // Set up an object repository, for actual data storage. let repo: Arc = match conf.repository_format { - RepositoryFormat::Layered => Arc::new(LayeredRepository::new(conf, Arc::new(walredo_mgr))), + RepositoryFormat::Layered => Arc::new(LayeredRepository::new(conf, + Arc::new(walredo_mgr), + tenantid + )), RepositoryFormat::RocksDb => { - let obj_store = RocksObjectStore::open(conf).unwrap(); + let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap(); Arc::new(ObjectRepository::new( conf, @@ -49,7 +51,7 @@ pub fn init(conf: &'static PageServerConf) { }; info!("initialized storage for tenant: {}", &tenantid); - m.insert(tenantid, Arc::new(repo)); + m.insert(tenantid, repo); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3c57974396..90c3e5514b 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -550,14 +550,18 @@ impl postgres_backend::Handler for PageServerHandler { RowDescriptor::int8_col(b"slru_deleted"), RowDescriptor::int8_col(b"chkp_deleted"), RowDescriptor::int8_col(b"dropped"), - - RowDescriptor::int8_col(b"snapshot_files_total"), - RowDescriptor::int8_col(b"snapshot_files_needed_by_cutoff"), - RowDescriptor::int8_col(b"snapshot_files_needed_by_branches"), - RowDescriptor::int8_col(b"snapshot_files_not_updated"), - RowDescriptor::int8_col(b"snapshot_files_removed"), - RowDescriptor::int8_col(b"snapshot_files_dropped"), - + RowDescriptor::int8_col(b"snapshot_relfiles_total"), + RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"), + RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"), + RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"), + RowDescriptor::int8_col(b"snapshot_relfiles_removed"), + RowDescriptor::int8_col(b"snapshot_relfiles_dropped"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"), + RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"), RowDescriptor::int8_col(b"elapsed"), ]))? .write_message_noflush(&BeMessage::DataRow(&[ @@ -568,14 +572,43 @@ impl postgres_backend::Handler for PageServerHandler { Some(&result.slru_deleted.to_string().as_bytes()), Some(&result.chkp_deleted.to_string().as_bytes()), Some(&result.dropped.to_string().as_bytes()), - - Some(&result.snapshot_files_total.to_string().as_bytes()), - Some(&result.snapshot_files_needed_by_cutoff.to_string().as_bytes()), - Some(&result.snapshot_files_needed_by_branches.to_string().as_bytes()), - Some(&result.snapshot_files_not_updated.to_string().as_bytes()), - Some(&result.snapshot_files_removed.to_string().as_bytes()), - Some(&result.snapshot_files_dropped.to_string().as_bytes()), - + Some(&result.snapshot_relfiles_total.to_string().as_bytes()), + Some( + &result + .snapshot_relfiles_needed_by_cutoff + .to_string() + .as_bytes(), + ), + Some( + &result + .snapshot_relfiles_needed_by_branches + .to_string() + .as_bytes(), + ), + Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()), + Some(&result.snapshot_relfiles_removed.to_string().as_bytes()), + Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()), + Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()), + Some( + &result + .snapshot_nonrelfiles_needed_by_cutoff + .to_string() + .as_bytes(), + ), + Some( + &result + .snapshot_nonrelfiles_needed_by_branches + .to_string() + .as_bytes(), + ), + Some( + &result + .snapshot_nonrelfiles_not_updated + .to_string() + .as_bytes(), + ), + Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()), + Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()), Some(&result.elapsed.as_millis().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs index 782388871b..3b3573766d 100644 --- a/pageserver/src/relish.rs +++ b/pageserver/src/relish.rs @@ -53,10 +53,7 @@ pub enum RelishTag { // relish. For example, pg_clog/0000, pg_clog/0001, and so forth. // // SLRU segments are divided into blocks, like relations. - Slru { - slru: SlruKind, - segno: u32, - }, + Slru { slru: SlruKind, segno: u32 }, // Miscellaneous other files that need to be included in the // tarball at compute node creation. These are non-blocky, and are @@ -76,17 +73,12 @@ pub enum RelishTag { // These files are always 512 bytes long (although we don't check // or care about that in the page server). // - FileNodeMap { - spcnode: Oid, - dbnode: Oid, - }, + FileNodeMap { spcnode: Oid, dbnode: Oid }, // // State files for prepared transactions (e.g pg_twophase/1234) // - TwoPhase { - xid: TransactionId, - }, + TwoPhase { xid: TransactionId }, // The control file, stored in global/pg_control ControlFile, @@ -114,6 +106,15 @@ impl RelishTag { | RelishTag::Checkpoint => false, } } + + // convenience function to check if this relish is a normal relation. + pub const fn is_relation(&self) -> bool { + if let RelishTag::Relation(_) = self { + true + } else { + false + } + } } /// diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 8a737d8acd..855ab79da0 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -9,7 +9,6 @@ use postgres_ffi::TransactionId; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use std::iter::Iterator; -use std::fmt; use std::ops::AddAssign; use std::sync::Arc; use std::time::Duration; @@ -53,17 +52,23 @@ pub struct GcResult { pub dropped: u64, // These are used for the LayeredRepository instead - pub snapshot_files_total: u64, - pub snapshot_files_needed_by_cutoff: u64, - pub snapshot_files_needed_by_branches: u64, - pub snapshot_files_not_updated: u64, - pub snapshot_files_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files. - pub snapshot_files_dropped: u64, // # of snapshot files removed because the relation was dropped + pub snapshot_relfiles_total: u64, + pub snapshot_relfiles_needed_by_cutoff: u64, + pub snapshot_relfiles_needed_by_branches: u64, + pub snapshot_relfiles_not_updated: u64, + pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files. + pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped + + pub snapshot_nonrelfiles_total: u64, + pub snapshot_nonrelfiles_needed_by_cutoff: u64, + pub snapshot_nonrelfiles_needed_by_branches: u64, + pub snapshot_nonrelfiles_not_updated: u64, + pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files. + pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped pub elapsed: Duration, } - impl AddAssign for GcResult { fn add_assign(&mut self, other: Self) { self.n_relations += other.n_relations; @@ -71,12 +76,20 @@ impl AddAssign for GcResult { self.deleted += other.deleted; self.dropped += other.dropped; - self.snapshot_files_total += other.snapshot_files_total; - self.snapshot_files_needed_by_cutoff += other.snapshot_files_needed_by_cutoff; - self.snapshot_files_needed_by_branches += other.snapshot_files_needed_by_branches; - self.snapshot_files_not_updated += other.snapshot_files_not_updated; - self.snapshot_files_removed += other.snapshot_files_removed; - self.snapshot_files_dropped += other.snapshot_files_dropped; + self.snapshot_relfiles_total += other.snapshot_relfiles_total; + self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff; + self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches; + self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated; + self.snapshot_relfiles_removed += other.snapshot_relfiles_removed; + self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped; + + self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total; + self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff; + self.snapshot_nonrelfiles_needed_by_branches += + other.snapshot_nonrelfiles_needed_by_branches; + self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated; + self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed; + self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped; self.elapsed += other.elapsed; } @@ -318,6 +331,8 @@ mod tests { buf.freeze() } + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); + fn get_test_repo( test_name: &str, repository_format: RepositoryFormat, @@ -548,6 +563,10 @@ mod tests { let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap(); let tline = repo.create_empty_timeline(timelineid, Lsn(0))?; + // Import initial dummy checkpoint record, otherwise the get_timeline() call + // after branching fails below + tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?; + // Create a relation on the timeline tline.init_valid_lsn(Lsn(1)); tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?; diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py index 268ac14411..d7a0984690 100644 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ b/test_runner/batch_others/test_snapfiles_gc.py @@ -5,7 +5,9 @@ import time; pytest_plugins = ("fixtures.zenith_fixtures") def print_gc_result(row): - print("GC duration {elapsed} ms, total: {snapshot_files_total}, needed_by_cutoff {snapshot_files_needed_by_cutoff}, needed_by_branches: {snapshot_files_needed_by_branches}, not_updated: {snapshot_files_not_updated}, removed: {snapshot_files_removed}, dropped: {snapshot_files_dropped}".format_map(row)) + print("GC duration {elapsed} ms".format_map(row)); + print(" REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}".format_map(row)) + print(" NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}".format_map(row)) # @@ -44,20 +46,20 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin): row = pscur.fetchone() print_gc_result(row); # remember the number of files - snapshot_files_total = row['snapshot_files_total'] - assert snapshot_files_total > 0 + snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed'] + assert snapshot_relfiles_remain > 0 # Insert a row. The first insert will also create a metadata entry for the # relation, with size == 1 block. Hence, bump up the expected relation count. - snapshot_files_total += 1; + snapshot_relfiles_remain += 1; print("Inserting one row and running GC") cur.execute("INSERT INTO foo VALUES (1)") pscur.execute(f"do_gc {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['snapshot_files_total'] == snapshot_files_total - assert row['snapshot_files_removed'] == 0 - assert row['snapshot_files_dropped'] == 0 + assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + assert row['snapshot_relfiles_removed'] == 0 + assert row['snapshot_relfiles_dropped'] == 0 # Insert two more rows and run GC. # This should create a new snapshot file with the new contents, and @@ -69,9 +71,9 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin): pscur.execute(f"do_gc {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['snapshot_files_total'] == snapshot_files_total + 1 - assert row['snapshot_files_removed'] == 1 - assert row['snapshot_files_dropped'] == 0 + assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1 + assert row['snapshot_relfiles_removed'] == 1 + assert row['snapshot_relfiles_dropped'] == 0 # Do it again. Should again create a new snapshot file and remove old one. print("Inserting two more rows and running GC") @@ -81,18 +83,18 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin): pscur.execute(f"do_gc {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['snapshot_files_total'] == snapshot_files_total + 1 - assert row['snapshot_files_removed'] == 1 - assert row['snapshot_files_dropped'] == 0 + assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1 + assert row['snapshot_relfiles_removed'] == 1 + assert row['snapshot_relfiles_dropped'] == 0 # Run GC again, with no changes in the database. Should not remove anything. print("Run GC again, with nothing to do") pscur.execute(f"do_gc {timeline} 0") row = pscur.fetchone() print_gc_result(row); - assert row['snapshot_files_total'] == snapshot_files_total - assert row['snapshot_files_removed'] == 0 - assert row['snapshot_files_dropped'] == 0 + assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + assert row['snapshot_relfiles_removed'] == 0 + assert row['snapshot_relfiles_dropped'] == 0 # # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage @@ -105,11 +107,11 @@ def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin): print_gc_result(row); # Each relation fork is counted separately, hence 3. - assert row['snapshot_files_dropped'] == 3 + assert row['snapshot_relfiles_dropped'] == 3 # The catalog updates also create new snapshot files of the catalogs, which # are counted as 'removed' - assert row['snapshot_files_removed'] > 0 + assert row['snapshot_relfiles_removed'] > 0 # TODO: perhaps we should count catalog and user relations separately, # to make this kind of testing more robust