mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-04 03:52:56 +00:00
Major storage format rewrite
Major changes and new concepts: Simplify Repository to a value-store ------------------------------------ Move the responsibility of tracking relation metadata, like which relations exist and what are their sizes, from Repository to a new module, pgdatadir_mapping.rs. The interface to Repository is now a simple key-value PUT/GET operations. It's still not any old key-value store though. A Repository is still responsible from handling branching, and every GET operation comes with an LSN. Key --- The key to the Repository key-value store is a Key struct, which consists of a few integer fields. It's wide enough to store a full RelFileNode, fork and block number, and to distinguish those from metadata keys. See pgdatadir_mapping.rs for how relation blocks and metadata keys are mapped to the Key struct. Store arbitrary key-ranges in the layer files --------------------------------------------- The concept of a "segment" is gone. Each layer file can store an arbitrary range of Keys. TODO: - Deleting keys, to reclaim space. This isn't visible to Postgres, dropping or truncating a relation works as you would expect if you look at it from the compute node. If you drop a relation, for example, the relation is removed from the metadata entry, so that it appears to be gone. However, the layered repository implementation never reclaims the storage. - Tracking "logical database size", for disk space quotas. That ought to be reimplemented now in pgdatadir_mapping.rs, or perhaps in walingest.rs. - LSM compaction. The logic for checkpointing and creating image layers is very dumb. AFAIK the *read* code could deal with a full-fledged LSM tree now consisting of the delta and image layers. But there's no code to take a bunch of delta layers and compact them, and the heuristics for when to create image layers is pretty dumb. - The code to track the layers is inefficient. All layers are just stored in a vector, and whenever we need to find a layer, we do a linear search in it.
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1307,6 +1307,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"nix",
|
||||
|
||||
@@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] }
|
||||
byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "3.0"
|
||||
|
||||
@@ -22,6 +22,7 @@ use tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::Timeline;
|
||||
use crate::DatadirTimelineImpl;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -29,9 +30,9 @@ use zenith_utils::lsn::Lsn;
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a, T> {
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<T>,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
}
|
||||
@@ -43,14 +44,12 @@ pub struct Basebackup<'a, T> {
|
||||
// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
// to start the replication.
|
||||
impl<'a, T> Basebackup<'a, T>
|
||||
where T: Timeline,
|
||||
{
|
||||
impl<'a> Basebackup<'a> {
|
||||
pub fn new(
|
||||
write: &'a mut dyn Write,
|
||||
timeline: &'a Arc<T>,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
req_lsn: Option<Lsn>,
|
||||
) -> Result<Basebackup<'a, T>> {
|
||||
) -> Result<Basebackup<'a>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
@@ -66,7 +65,7 @@ where T: Timeline,
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
timeline.wait_lsn(req_lsn)?;
|
||||
timeline.tline.wait_lsn(req_lsn)?;
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
@@ -117,20 +116,21 @@ where T: Timeline,
|
||||
}
|
||||
|
||||
// Gather non-relational files from object storage pages.
|
||||
for obj in self.timeline.list_nonrels(self.lsn)? {
|
||||
match obj {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
self.add_slru_segment(slru, segno)?;
|
||||
}
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
self.add_relmap_file(spcnode, dbnode)?;
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
_ => {}
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
|
||||
self.add_slru_segment(kind, segno)?;
|
||||
}
|
||||
}
|
||||
for (spcnode, dbnode) in self.timeline.list_relmap_files(self.lsn)? {
|
||||
self.add_relmap_file(spcnode, dbnode)?;
|
||||
}
|
||||
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file()?;
|
||||
@@ -143,27 +143,14 @@ where T: Timeline,
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let seg_size = self
|
||||
.timeline
|
||||
.get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?;
|
||||
|
||||
if seg_size == None {
|
||||
trace!(
|
||||
"SLRU segment {}/{:>04X} was truncated",
|
||||
slru.to_str(),
|
||||
segno
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let nblocks = seg_size.unwrap();
|
||||
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
|
||||
|
||||
let mut slru_buf: Vec<u8> =
|
||||
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
slru_buf.extend_from_slice(&img);
|
||||
@@ -182,11 +169,7 @@ where T: Timeline,
|
||||
// Along with them also send PG_VERSION for each database.
|
||||
//
|
||||
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn(
|
||||
RelishTag::FileNodeMap { spcnode, dbnode },
|
||||
0,
|
||||
self.lsn,
|
||||
)?;
|
||||
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
|
||||
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
|
||||
@@ -222,9 +205,7 @@ where T: Timeline,
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
|
||||
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -244,11 +225,11 @@ where T: Timeline,
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
|
||||
.get_checkpoint(self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
|
||||
.get_control_file(self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
@@ -269,7 +250,7 @@ where T: Timeline,
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
||||
if self.lsn == self.timeline.tline.get_ancestor_lsn() {
|
||||
write!(zenith_signal, "PREV LSN: none")?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
||||
|
||||
@@ -20,12 +20,14 @@ use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::{crashsafe_dir, logging};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{config::PageServerConf, repository::Repository};
|
||||
use crate::RepositoryImpl;
|
||||
use crate::{import_datadir, LOG_FILE_NAME};
|
||||
use crate::{repository::RepositoryTimeline, tenant_mgr};
|
||||
use crate::repository::Timeline;
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
@@ -40,10 +42,10 @@ pub struct BranchInfo {
|
||||
}
|
||||
|
||||
impl BranchInfo {
|
||||
pub fn from_path<R: Repository, P: AsRef<Path>>(
|
||||
path: P,
|
||||
pub fn from_path<R: Repository, T: AsRef<Path>>(
|
||||
path: T,
|
||||
repo: &R,
|
||||
include_non_incremental_logical_size: bool,
|
||||
_include_non_incremental_logical_size: bool,
|
||||
) -> Result<Self> {
|
||||
let path = path.as_ref();
|
||||
let name = path.file_name().unwrap().to_string_lossy().to_string();
|
||||
@@ -74,11 +76,17 @@ impl BranchInfo {
|
||||
|
||||
// non incremental size calculation can be heavy, so let it be optional
|
||||
// needed for tests to check size calculation
|
||||
//
|
||||
// FIXME
|
||||
/*
|
||||
let current_logical_size_non_incremental = include_non_incremental_logical_size
|
||||
.then(|| {
|
||||
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
|
||||
})
|
||||
.transpose()?;
|
||||
*/
|
||||
let current_logical_size_non_incremental = Some(0);
|
||||
let current_logical_size = 0;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
@@ -86,7 +94,7 @@ impl BranchInfo {
|
||||
latest_valid_lsn: timeline.get_last_record_lsn(),
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size: timeline.get_current_logical_size(),
|
||||
current_logical_size, // : timeline.get_current_logical_size(),
|
||||
current_logical_size_non_incremental,
|
||||
})
|
||||
}
|
||||
@@ -130,7 +138,7 @@ pub fn create_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
) -> Result<Arc<crate::layered_repository::LayeredRepository>> {
|
||||
) -> Result<Arc<RepositoryImpl>> {
|
||||
let repo_dir = conf.tenant_path(&tenantid);
|
||||
if repo_dir.exists() {
|
||||
bail!("repo for {} already exists", tenantid)
|
||||
@@ -152,19 +160,19 @@ pub fn create_repo(
|
||||
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
|
||||
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
|
||||
let repo = crate::layered_repository::LayeredRepository::new(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
);
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
|
||||
bootstrap_timeline(conf, tenantid, timeline_id, &repo)?;
|
||||
|
||||
Ok(repo)
|
||||
Ok(Arc::new(repo))
|
||||
}
|
||||
|
||||
// Returns checkpoint LSN from controlfile
|
||||
@@ -233,17 +241,16 @@ fn bootstrap_timeline<R: Repository>(
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
&*timeline,
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
let page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline);
|
||||
|
||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &page_tline, lsn)?;
|
||||
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
tli,
|
||||
timeline.get_last_record_lsn()
|
||||
page_tline.tline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
let data = tli.to_string();
|
||||
|
||||
@@ -26,8 +26,9 @@ use super::models::BranchCreateRequest;
|
||||
use super::models::StatusResponse;
|
||||
use super::models::TenantCreateRequest;
|
||||
use crate::branches::BranchInfo;
|
||||
use crate::repository::{Repository, RepositoryTimeline, Timeline};
|
||||
use crate::repository::RepositoryTimeline;
|
||||
use crate::repository::TimelineSyncState;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -11,14 +11,15 @@ use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use tracing::*;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::repository::Repository;
|
||||
use crate::walingest::WalIngest;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
@@ -27,45 +28,43 @@ use zenith_utils::lsn::Lsn;
|
||||
/// This is currently only used to import a cluster freshly created by initdb.
|
||||
/// The code that deals with the checkpoint would not work right if the
|
||||
/// cluster was not shut down cleanly.
|
||||
pub fn import_timeline_from_postgres_datadir<T: Timeline>(
|
||||
pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
||||
path: &Path,
|
||||
timeline: &T,
|
||||
tline: &DatadirTimeline<R>,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
let writer_box = timeline.writer();
|
||||
let writer = writer_box.as_ref();
|
||||
let mut writer = tline.begin_record(lsn);
|
||||
writer.init_empty()?;
|
||||
|
||||
// Scan 'global'
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
writer.put_dbdir_creation(pg_constants::GLOBALTABLESPACE_OID, 0)?;
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
Some("pg_control") => {
|
||||
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
|
||||
pg_control = Some(import_control_file(&mut writer, &direntry.path())?);
|
||||
}
|
||||
Some("pg_filenode.map") => {
|
||||
import_relmap_file(
|
||||
&mut writer,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
},
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?,
|
||||
// Load any relation files into the page server (but only after the other files)
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?;
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
@@ -79,54 +78,56 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
|
||||
|
||||
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
|
||||
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode: dboid,
|
||||
},
|
||||
Some("PG_VERSION") => {
|
||||
writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_relmap_file(
|
||||
&mut writer,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?,
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(
|
||||
&mut writer,
|
||||
&relfile,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
|
||||
import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
import_twophase_file(&mut writer, xid, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
// We're done importing all the data files.
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
writer.finish()?;
|
||||
|
||||
// We expect the Postgres server to be shut down cleanly.
|
||||
let pg_control = pg_control.context("pg_control file not found")?;
|
||||
@@ -144,8 +145,7 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
|
||||
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
|
||||
import_wal(
|
||||
&path.join("pg_wal"),
|
||||
timeline,
|
||||
writer,
|
||||
tline,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
lsn,
|
||||
)?;
|
||||
@@ -154,10 +154,9 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
fn import_relfile<R: Repository>(
|
||||
timeline: &mut DatadirTimelineWriter<R>,
|
||||
path: &Path,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
@@ -174,19 +173,28 @@ fn import_relfile(
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
if segno != 0 {
|
||||
todo!();
|
||||
}
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
timeline.put_rel_creation(rel, nblocks as u32)?;
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?;
|
||||
timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
@@ -203,20 +211,37 @@ fn import_relfile(
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
ensure!(blknum == nblocks as u32);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// FIXME
|
||||
/// Import a "non-blocky" file into the repository
|
||||
///
|
||||
/// This is used for small files like the control file, twophase files etc. that
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
tag: RelishTag,
|
||||
fn import_relmap_file<R: Repository>(
|
||||
timeline: &mut DatadirTimelineWriter<R>,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing relmap file {}", path.display());
|
||||
|
||||
timeline.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn import_twophase_file<R: Repository>(
|
||||
timeline: &mut DatadirTimelineWriter<R>,
|
||||
xid: TransactionId,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
@@ -226,7 +251,7 @@ fn import_nonrel_file(
|
||||
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
timeline.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -235,9 +260,8 @@ fn import_nonrel_file(
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
fn import_control_file<R: Repository>(
|
||||
timeline: &mut DatadirTimelineWriter<R>,
|
||||
path: &Path,
|
||||
) -> Result<ControlFileData> {
|
||||
let mut file = File::open(path)?;
|
||||
@@ -248,17 +272,12 @@ fn import_control_file(
|
||||
trace!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
timeline.put_page_image(
|
||||
RelishTag::ControlFile,
|
||||
0,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buffer[..]),
|
||||
)?;
|
||||
timeline.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
|
||||
timeline.put_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
}
|
||||
@@ -266,30 +285,31 @@ fn import_control_file(
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
fn import_slru_file<R: Repository>(
|
||||
timeline: &mut DatadirTimelineWriter<R>,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
trace!("importing slru file {}", path.display());
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
trace!("importing slru file {}", path.display());
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
|
||||
|
||||
timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
timeline.put_page_image(
|
||||
RelishTag::Slru { slru, segno },
|
||||
rpageno,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buf),
|
||||
)?;
|
||||
timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
@@ -305,19 +325,17 @@ fn import_slru_file(
|
||||
},
|
||||
};
|
||||
rpageno += 1;
|
||||
|
||||
// TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages
|
||||
}
|
||||
ensure!(rpageno == nblocks as u32);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal<T: Timeline>(
|
||||
fn import_wal<R: Repository>(
|
||||
walpath: &Path,
|
||||
timeline: &T,
|
||||
writer: &dyn TimelineWriter,
|
||||
tline: &DatadirTimeline<R>,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
) -> Result<()> {
|
||||
@@ -327,7 +345,7 @@ fn import_wal<T: Timeline>(
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let mut walingest = WalIngest::new(timeline, startpoint)?;
|
||||
let mut walingest = WalIngest::new(tline, startpoint)?;
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
@@ -360,7 +378,7 @@ fn import_wal<T: Timeline>(
|
||||
let mut nrecords = 0;
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest.ingest_record(timeline, writer, recdata, lsn)?;
|
||||
walingest.ingest_record(tline, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,6 +1,5 @@
|
||||
//!
|
||||
//! A DeltaLayer represents a collection of WAL records or page images in a range of
|
||||
//! LSNs, for one segment. It is stored on a file on disk.
|
||||
//! LSNs, and in a range of Keys. It is stored on a file on disk.
|
||||
//!
|
||||
//! Usually a delta layer only contains differences - in the form of WAL records against
|
||||
//! a base LSN. However, if a segment is newly created, by creating a new relation or
|
||||
@@ -11,56 +10,53 @@
|
||||
//! can happen when you create a new branch in the middle of a delta layer, and the WAL
|
||||
//! records on the new branch are put in a new delta layer.
|
||||
//!
|
||||
//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters
|
||||
//! When a delta file needs to be accessed, we slurp the 'index' metadata
|
||||
//! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
|
||||
//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN.
|
||||
//! The byte ranges in the metadata can be used to find the page/WAL record in
|
||||
//! PAGE_VERSIONS_CHAPTER.
|
||||
//! To access a particular value, we search `index` for the given key.
|
||||
//! The byte offset in the index can be used to find the value in
|
||||
//! VALUES_CHAPTER.
|
||||
//!
|
||||
//! On disk, the delta files are stored in timelines/<timelineid> directory.
|
||||
//! Currently, there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//!
|
||||
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
|
||||
//! So the above example would become:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED
|
||||
//!
|
||||
//! The end LSN indicates when it was dropped in that case, we don't store it in the
|
||||
//! file contents in any way.
|
||||
//!
|
||||
//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two
|
||||
//! parts: the page versions and the segment sizes. They are stored as separate chapters.
|
||||
//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
|
||||
//! parts: the 'index', the values, and a short summary header. They are stored as
|
||||
//! separate chapters.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::layered_repository::utils;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::io::BufWriter;
|
||||
use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter};
|
||||
use bookfile::{Book, BookWriter, ChapterWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -68,27 +64,23 @@ use zenith_utils::lsn::Lsn;
|
||||
// Magic constant to identify a Zenith delta file
|
||||
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
|
||||
/// Mapping from (block #, lsn) -> page/WAL record
|
||||
/// byte ranges in PAGE_VERSIONS_CHAPTER
|
||||
static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
|
||||
/// Mapping from (key, lsn) -> page/WAL record
|
||||
/// byte ranges in VALUES_CHAPTER
|
||||
static INDEX_CHAPTER: u64 = 1;
|
||||
|
||||
/// Page/WAL bytes - cannot be interpreted
|
||||
/// without PAGE_VERSION_METAS_CHAPTER
|
||||
static PAGE_VERSIONS_CHAPTER: u64 = 2;
|
||||
static SEG_SIZES_CHAPTER: u64 = 3;
|
||||
/// without the page versions from the INDEX_CHAPTER
|
||||
static VALUES_CHAPTER: u64 = 2;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
static SUMMARY_CHAPTER: u64 = 4;
|
||||
static SUMMARY_CHAPTER: u64 = 3;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl From<&DeltaLayer> for Summary {
|
||||
@@ -96,33 +88,17 @@ impl From<&DeltaLayer> for Summary {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
|
||||
start_lsn: layer.start_lsn,
|
||||
end_lsn: layer.end_lsn,
|
||||
|
||||
dropped: layer.dropped,
|
||||
key_range: layer.key_range.clone(),
|
||||
lsn_range: layer.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
|
||||
/// copy of the file in memory, in 'inner'. Otherwise the struct is
|
||||
/// copy of the index in memory, in 'inner'. Otherwise the struct is
|
||||
/// just a placeholder for a file that exists on disk, and it needs to
|
||||
/// be loaded before using it in queries.
|
||||
///
|
||||
@@ -131,47 +107,24 @@ pub struct DeltaLayer {
|
||||
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
//
|
||||
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||
// start is inclusive, and end is exclusive.
|
||||
//
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
|
||||
inner: Mutex<DeltaLayerInner>,
|
||||
inner: RwLock<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
pub struct DeltaLayerInner {
|
||||
/// If false, the 'page_version_metas' and 'seg_sizes' have not been
|
||||
/// loaded into memory yet.
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here.
|
||||
/// Indexed by block number and LSN. The value is an offset into the
|
||||
/// chapter where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
|
||||
/// `seg_sizes` tracks the size of the segment at different points in time.
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
|
||||
// Scan the VecMap backwards, starting from the given entry.
|
||||
let slice = self
|
||||
.seg_sizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
bail!("could not find seg size in delta layer")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -183,40 +136,31 @@ impl Layer for DeltaLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
self.dropped
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
self.end_lsn
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
lsn_floor: Lsn,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
assert!(self.key_range.contains(&reconstruct_state.key));
|
||||
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if &self.lsn_range.end <= cached_lsn => {
|
||||
reconstruct_state.lsn = *cached_lsn;
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -224,91 +168,74 @@ impl Layer for DeltaLayer {
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load()?;
|
||||
let page_version_reader = inner
|
||||
let values_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
// Scan the metadata VecMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&reconstruct_state.key) {
|
||||
let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
reconstruct_state.lsn = *cached_lsn;
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some((*pv_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum >= inner.get_seg_size(lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
reconstruct_state.lsn = Lsn(self.lsn_range.start.0 - 1);
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
// Return a set of all distinct Keys present in this layer
|
||||
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
|
||||
let inner = self.load()?;
|
||||
inner.get_seg_size(lsn)
|
||||
|
||||
keys.extend(inner.index.keys().filter(|x| key_range.contains(x)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
// Is the requested LSN after the rel was dropped?
|
||||
if self.dropped && lsn >= self.end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
|
||||
let inner = self.load().unwrap();
|
||||
|
||||
// Otherwise, it exists.
|
||||
Ok(true)
|
||||
let mut pairs: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
pairs.sort_by_key(|x| x.0);
|
||||
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err)))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -316,14 +243,14 @@ impl Layer for DeltaLayer {
|
||||
/// it will need to be loaded back.
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.seg_sizes = VecMap::default();
|
||||
inner.loaded = false;
|
||||
if let Ok(mut inner) = self.inner.try_write() {
|
||||
inner.index = HashMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
// machinery has its own rules for closing the file descriptor if it's not
|
||||
// needed, but the Book struct uses up some memory, too.
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
// machinery has its own rules for closing the file descriptor if it's not
|
||||
// needed, but the Book struct uses up some memory, too.
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -345,45 +272,52 @@ impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} seg {} {}-{} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
println!("--- seg sizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.seg_sizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
|
||||
let path = self.path();
|
||||
let file = std::fs::File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
let chapter = book.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
let mut values: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
values.sort_by_key(|k| k.0);
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
for (key, versions) in values {
|
||||
for (lsn, off) in versions.as_slice() {
|
||||
let mut desc = String::new();
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
blob_range.size,
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
let buf = utils::read_blob_from_chapter(&chapter, *off)?;
|
||||
let val = Value::des(&buf);
|
||||
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -408,61 +342,61 @@ impl DeltaLayer {
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
fn load(&self) -> Result<MutexGuard<DeltaLayerInner>> {
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
fn load(&self) -> Result<RwLockReadGuard<DeltaLayerInner>> {
|
||||
loop {
|
||||
// quick exit if already loaded
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)?;
|
||||
inner.book = Some(Book::new(file)?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let actual_summary = Summary::des(&chapter)?;
|
||||
|
||||
let expected_summary = Summary::from(self);
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
// need to upgrade to write lock
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)?;
|
||||
inner.book = Some(Book::new(file)?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let actual_summary = Summary::des(&chapter)?;
|
||||
|
||||
let expected_summary = Summary::from(self);
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
|
||||
}
|
||||
}
|
||||
PathOrConf::Path(path) => {
|
||||
let actual_filename = Path::new(path.file_name().unwrap());
|
||||
let expected_filename = self.filename();
|
||||
|
||||
if actual_filename != expected_filename {
|
||||
println!(
|
||||
"warning: filename does not match what is expected from in-file summary"
|
||||
);
|
||||
println!("actual: {:?}", actual_filename);
|
||||
println!("expected: {:?}", expected_filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(INDEX_CHAPTER)?;
|
||||
let index = HashMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.index = index;
|
||||
inner.loaded = true;
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?;
|
||||
let seg_sizes = VecMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.seg_sizes = seg_sizes;
|
||||
inner.loaded = true;
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -476,15 +410,12 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
start_lsn: filename.start_lsn,
|
||||
end_lsn: filename.end_lsn,
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn_range: filename.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
index: HashMap::default(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -494,7 +425,7 @@ impl DeltaLayer {
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
F: FileExt,
|
||||
{
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
@@ -503,25 +434,20 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
start_lsn: summary.start_lsn,
|
||||
end_lsn: summary.end_lsn,
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
key_range: summary.key_range,
|
||||
lsn_range: summary.lsn_range,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
index: HashMap::default(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -542,24 +468,24 @@ impl DeltaLayer {
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_version` for every page
|
||||
/// 2. Write the contents by calling `put_value` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
|
||||
page_version_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
pv_offset: u64,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
values_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
end_offset: u64,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
@@ -570,94 +496,85 @@ impl DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
// Create the file
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&DeltaFileName {
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
},
|
||||
);
|
||||
|
||||
let path = conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.join(format!("{}-XXX__{:016X}-{:016X}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end)));
|
||||
info!("temp deltalayer path {}", path.display());
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-versions chapter for writing. The calls to
|
||||
// `put_page_version` will use this to write the contents.
|
||||
let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER);
|
||||
// `put_value` will use this to write the contents.
|
||||
let values_writer = book.new_chapter(VALUES_CHAPTER);
|
||||
|
||||
Ok(DeltaLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
page_version_writer,
|
||||
page_version_metas: VecMap::default(),
|
||||
pv_offset: 0,
|
||||
key_start,
|
||||
lsn_range,
|
||||
index: HashMap::new(),
|
||||
values_writer,
|
||||
end_offset: 0,
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a page version to the file.
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
/// 'buf' is a serialized PageVersion.
|
||||
/// The page versions must be appended in blknum, lsn order.
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> {
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
//info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
// Remember the offset and size metadata. The metadata is written
|
||||
// to a separate chapter, in `finish`.
|
||||
let blob_range = BlobRange {
|
||||
offset: self.pv_offset,
|
||||
size: buf.len(),
|
||||
};
|
||||
self.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
|
||||
// write the page version
|
||||
self.page_version_writer.write_all(buf)?;
|
||||
self.pv_offset += buf.len() as u64;
|
||||
let off = self.end_offset;
|
||||
let len = utils::write_blob(&mut self.values_writer, &Value::ser(&val)?)?;
|
||||
self.end_offset += len;
|
||||
let vec_map = self.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
bail!(
|
||||
"Value for {} at {} already exists in delta layer being built",
|
||||
key,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.end_offset
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
/// 'seg_sizes' is a list of size changes to store with the actual data.
|
||||
///
|
||||
pub fn finish(self, seg_sizes: VecMap<Lsn, SegmentBlk>) -> Result<DeltaLayer> {
|
||||
// Close the page-versions chapter
|
||||
let book = self.page_version_writer.close()?;
|
||||
pub fn finish(self, key_end: Key) -> Result<DeltaLayer> {
|
||||
// Close the values chapter
|
||||
let book = self.values_writer.close()?;
|
||||
|
||||
// Write out page versions metadata
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&self.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
if self.seg.rel.is_blocky() {
|
||||
assert!(!seg_sizes.is_empty());
|
||||
}
|
||||
|
||||
// and seg_sizes to separate chapter
|
||||
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&seg_sizes)?;
|
||||
// Write out the index
|
||||
let mut chapter = book.new_chapter(INDEX_CHAPTER);
|
||||
let buf = HashMap::ser(&self.index)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
@@ -665,12 +582,8 @@ impl DeltaLayerWriter {
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
|
||||
dropped: self.dropped,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
@@ -685,20 +598,111 @@ impl DeltaLayerWriter {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
index: HashMap::new(),
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
}),
|
||||
};
|
||||
|
||||
trace!("created delta layer {}", &layer.path().display());
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, final_path)?;
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
pub fn abort(self) {
|
||||
match self.values_writer.close() {
|
||||
Ok(book) => {
|
||||
if let Err(err) = book.close() {
|
||||
error!("error while closing delta layer file: {}", err);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("error while closing chapter writer: {}", err);
|
||||
}
|
||||
}
|
||||
if let Err(err) = std::fs::remove_file(self.path) {
|
||||
error!("error removing unfinished delta layer file: {}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct DeltaValueIter<'a> {
|
||||
all_offsets: Vec<(Key, Lsn, u64)>,
|
||||
next_idx: usize,
|
||||
|
||||
inner: RwLockReadGuard<'a, DeltaLayerInner>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
|
||||
|
||||
let mut index: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
|
||||
index.sort_by_key(|x| x.0);
|
||||
|
||||
let mut all_offsets: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
for (key, vec_map) in index.iter() {
|
||||
for (lsn, off) in vec_map.as_slice().iter() {
|
||||
all_offsets.push((**key, *lsn, *off));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DeltaValueIter {
|
||||
all_offsets,
|
||||
inner,
|
||||
next_idx: 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (key, lsn, off) = self.all_offsets[self.next_idx];
|
||||
|
||||
let values_reader = self.inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
let val = Value::des(&utils::read_blob_from_chapter(&values_reader, off)?)?;
|
||||
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,29 +2,52 @@
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::storage_layer::SegmentTag;
|
||||
use crate::relish::*;
|
||||
use crate::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
pub dropped: bool,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl PartialOrd for DeltaFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for DeltaFileName {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp;
|
||||
|
||||
cmp = self.key_range.start.cmp(&other.key_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.key_range.end.cmp(&other.key_range.end);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn_range.start.cmp(&other.lsn_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
|
||||
cmp
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the filename of a DeltaLayer
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>
|
||||
///
|
||||
/// or if it was dropped:
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>_DROPPED
|
||||
/// <seg start>-<seg end>-<LSN start>-<LSN end>
|
||||
///
|
||||
impl DeltaFileName {
|
||||
///
|
||||
@@ -32,234 +55,124 @@ impl DeltaFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
let mut lsn_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_start_str = lsn_parts.next()?;
|
||||
let lsn_end_str = lsn_parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
|
||||
let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;
|
||||
|
||||
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
let mut dropped = false;
|
||||
if let Some(suffix) = parts.next() {
|
||||
if suffix == "DROPPED" {
|
||||
dropped = true;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
if parts.next().is_some() {
|
||||
if start_lsn >= end_lsn {
|
||||
return None;
|
||||
// or panic?
|
||||
}
|
||||
|
||||
if key_start >= key_end {
|
||||
return None;
|
||||
// or panic?
|
||||
}
|
||||
|
||||
Some(DeltaFileName {
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
key_range: key_start..key_end,
|
||||
lsn_range: start_lsn..end_lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
u64::from(self.start_lsn),
|
||||
u64::from(self.end_lsn),
|
||||
if self.dropped { "_DROPPED" } else { "" }
|
||||
"{}-{}__{:016X}-{:016X}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
u64::from(self.lsn_range.start),
|
||||
u64::from(self.lsn_range.end),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct ImageFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PartialOrd for ImageFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ImageFileName {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp;
|
||||
|
||||
cmp = self.key_range.start.cmp(&other.key_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.key_range.end.cmp(&other.key_range.end);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn.cmp(&other.lsn);
|
||||
|
||||
cmp
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents the filename of an ImageLayer
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<LSN>
|
||||
///
|
||||
/// FIXME
|
||||
impl ImageFileName {
|
||||
///
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_str = parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let lsn = Lsn::from_hex(lsn_str).ok()?;
|
||||
|
||||
let lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
if parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(ImageFileName { seg, lsn })
|
||||
Some(ImageFileName {
|
||||
key_range: key_start..key_end,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
"{}-{}__{:016X}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
u64::from(self.lsn),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to relocate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
|
||||
RwLock::new(InMemoryLayers::default());
|
||||
}
|
||||
|
||||
// TODO these types can probably be smaller
|
||||
#[derive(PartialEq, Eq, Clone, Copy)]
|
||||
pub struct LayerId {
|
||||
index: usize,
|
||||
tag: u64, // to avoid ABA problem
|
||||
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct InMemoryLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl InMemoryLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -4,38 +4,34 @@
|
||||
//! On disk, the image files are stored in timelines/<timelineid> directory.
|
||||
//! Currently, there are no subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! Note that segno is
|
||||
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
|
||||
//! <key start>-<key end>__<LSN>
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348
|
||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
//!
|
||||
//! An image file is constructed using the 'bookfile' crate.
|
||||
//!
|
||||
//! Only metadata is loaded into memory by the load function.
|
||||
//! When images are needed, they are read directly from disk.
|
||||
//!
|
||||
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
|
||||
//! All the images are required to be BLOCK_SIZE, which allows for random access.
|
||||
//!
|
||||
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag,
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::layered_repository::utils;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Range;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
|
||||
@@ -44,12 +40,16 @@ use bookfile::{Book, BookWriter, ChapterWriter};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Magic constant to identify a Zenith segment image file
|
||||
// Magic constant to identify a Zenith image layer file
|
||||
// FIXME: bump all magics
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
|
||||
/// Mapping from (key, lsn) -> page/WAL record
|
||||
/// byte ranges in VALUES_CHAPTER
|
||||
static INDEX_CHAPTER: u64 = 1;
|
||||
|
||||
/// Contains each block in block # order
|
||||
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
|
||||
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
|
||||
const VALUES_CHAPTER: u64 = 2;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
const SUMMARY_CHAPTER: u64 = 3;
|
||||
@@ -58,7 +58,7 @@ const SUMMARY_CHAPTER: u64 = 3;
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
key_range: Range<Key>,
|
||||
|
||||
lsn: Lsn,
|
||||
}
|
||||
@@ -68,19 +68,17 @@ impl From<&ImageLayer> for Summary {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
key_range: layer.key_range.clone(),
|
||||
|
||||
lsn: layer.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const BLOCK_SIZE: usize = 8192;
|
||||
|
||||
///
|
||||
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
||||
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
|
||||
/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'.
|
||||
/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'.
|
||||
/// Otherwise the struct is just a placeholder for a file that exists on disk,
|
||||
/// and it needs to be loaded before using it in queries.
|
||||
///
|
||||
@@ -88,7 +86,7 @@ pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
pub key_range: Range<Key>,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub lsn: Lsn,
|
||||
@@ -96,18 +94,15 @@ pub struct ImageLayer {
|
||||
inner: Mutex<ImageLayerInner>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum ImageType {
|
||||
Blocky { num_blocks: SegmentBlk },
|
||||
NonBlocky,
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
/// If None, the 'image_type' has not been loaded into memory yet. FIXME
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
/// offset of each value
|
||||
index: HashMap<Key, u64>,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -123,98 +118,80 @@ impl Layer for ImageLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
// End-bound is exclusive
|
||||
self.lsn + 1
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
assert!(lsn >= self.lsn);
|
||||
lsn_floor: Lsn,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
assert!(lsn_floor <= self.lsn);
|
||||
assert!(self.key_range.contains(&reconstruct_state.key));
|
||||
assert!(reconstruct_state.lsn >= self.lsn);
|
||||
|
||||
match reconstruct_data.page_img {
|
||||
match reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if self.lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
reconstruct_state.lsn = cached_lsn;
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
// Check if the request is beyond EOF
|
||||
if blknum >= *num_blocks {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
if let Some(offset) = inner.index.get(&reconstruct_state.key) {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * blknum as u64;
|
||||
let blob = utils::read_blob_from_chapter(&chapter, *offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read value from data file {} at offset {}",
|
||||
self.filename().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
|
||||
chapter.read_exact_at(&mut buf, offset).with_context(|| {
|
||||
format!(
|
||||
"failed to read page from data file {} at offset {}",
|
||||
self.filename().display(),
|
||||
offset
|
||||
)
|
||||
})?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(blknum == 0);
|
||||
inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
|
||||
.into_vec()
|
||||
}
|
||||
};
|
||||
|
||||
reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf)));
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
|
||||
/// Get size of the segment
|
||||
fn get_seg_size(&self, _lsn: Lsn) -> Result<SegmentBlk> {
|
||||
let inner = self.load()?;
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => Ok(num_blocks),
|
||||
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
reconstruct_state.lsn = self.lsn;
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
reconstruct_state.lsn = self.lsn;
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
|
||||
Ok(true)
|
||||
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
|
||||
let inner = self.load()?;
|
||||
|
||||
let index = &inner.index;
|
||||
|
||||
keys.extend(index.keys().filter(|x| key_range.contains(x)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn unload(&self) -> Result<()> {
|
||||
// TODO: unload 'segs'. Or even better, don't hold it in memory but
|
||||
// access it directly from the file (using the buffer cache)
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.index = HashMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -235,22 +212,17 @@ impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} seg {} at {} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.lsn
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} ----",
|
||||
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
|
||||
);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
}
|
||||
let mut index_vec: Vec<(&Key, &u64)> = inner.index.iter().collect();
|
||||
index_vec.sort_by_key(|x| x.1);
|
||||
|
||||
for (key, offset) in index_vec {
|
||||
println!("key: {} offset {}", key, offset);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -279,19 +251,21 @@ impl ImageLayer {
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
if inner.book.is_some() {
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
|
||||
let book = Book::new(file).with_context(|| {
|
||||
format!(
|
||||
"Failed to open virtual file '{}' as a bookfile",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.book = Some(Book::new(file).with_context(|| {
|
||||
format!("Failed to open file '{}' as a bookfile", path.display())
|
||||
})?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -318,23 +292,13 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let images_len = chapter.len();
|
||||
ensure!(images_len % BLOCK_SIZE as u64 == 0);
|
||||
let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let chapter = book.read_chapter(INDEX_CHAPTER)?;
|
||||
let index = HashMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
info!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
book: Some(book),
|
||||
image_type,
|
||||
};
|
||||
inner.index = index;
|
||||
inner.loaded = true;
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -350,11 +314,12 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn: filename.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
index: HashMap::new(),
|
||||
loaded: false,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -373,18 +338,19 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
key_range: summary.key_range,
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
index: HashMap::new(),
|
||||
loaded: false,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
@@ -413,15 +379,18 @@ impl ImageLayer {
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
num_blocks: SegmentBlk,
|
||||
values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
|
||||
end_offset: u64,
|
||||
|
||||
page_image_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
num_blocks_written: SegmentBlk,
|
||||
index: HashMap<Key, u64>,
|
||||
|
||||
finished: bool,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
@@ -429,9 +398,8 @@ impl ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
num_blocks: SegmentBlk,
|
||||
) -> Result<ImageLayerWriter> {
|
||||
// Create the file
|
||||
//
|
||||
@@ -441,70 +409,74 @@ impl ImageLayerWriter {
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName { seg, lsn },
|
||||
&ImageFileName {
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
info!("new image layer {}", path.display());
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-images chapter for writing. The calls to
|
||||
// `put_page_image` will use this to write the contents.
|
||||
let chapter = if seg.rel.is_blocky() {
|
||||
book.new_chapter(BLOCKY_IMAGES_CHAPTER)
|
||||
} else {
|
||||
assert_eq!(num_blocks, 1);
|
||||
book.new_chapter(NONBLOCKY_IMAGE_CHAPTER)
|
||||
};
|
||||
// `put_image` will use this to write the contents.
|
||||
let chapter = book.new_chapter(VALUES_CHAPTER);
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
num_blocks,
|
||||
page_image_writer: chapter,
|
||||
num_blocks_written: 0,
|
||||
values_writer: Some(chapter),
|
||||
index: HashMap::new(),
|
||||
end_offset: 0,
|
||||
finished: false,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next page image to the file.
|
||||
/// Write next value to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> {
|
||||
assert!(self.num_blocks_written < self.num_blocks);
|
||||
if self.seg.rel.is_blocky() {
|
||||
assert_eq!(block_bytes.len(), BLOCK_SIZE);
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
let off = self.end_offset;
|
||||
|
||||
if let Some(writer) = &mut self.values_writer {
|
||||
let len = utils::write_blob(writer, img)?;
|
||||
self.end_offset += len;
|
||||
|
||||
let old = self.index.insert(key, off);
|
||||
assert!(old.is_none());
|
||||
} else {
|
||||
panic!()
|
||||
}
|
||||
self.page_image_writer.write_all(block_bytes)?;
|
||||
self.num_blocks_written += 1;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<ImageLayer> {
|
||||
// Check that the `put_page_image' was called for every block.
|
||||
assert!(self.num_blocks_written == self.num_blocks);
|
||||
pub fn finish(&mut self) -> Result<ImageLayer> {
|
||||
// Close the values chapter
|
||||
let book = self.values_writer.take().unwrap().close()?;
|
||||
|
||||
// Close the page-images chapter
|
||||
let book = self.page_image_writer.close()?;
|
||||
// Write out the index
|
||||
let mut chapter = book.new_chapter(INDEX_CHAPTER);
|
||||
let buf = HashMap::ser(&self.index)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// Write out the summary chapter
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
ImageType::Blocky {
|
||||
num_blocks: self.num_blocks,
|
||||
}
|
||||
} else {
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
@@ -520,15 +492,31 @@ impl ImageLayerWriter {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
timelineid: self.timelineid,
|
||||
tenantid: self.tenantid,
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type,
|
||||
loaded: false,
|
||||
index: HashMap::new(),
|
||||
}),
|
||||
};
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
self.finished = true;
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(page_image_writer) = self.values_writer.take() {
|
||||
if let Ok(book) = page_image_writer.close() {
|
||||
let _ = book.close();
|
||||
}
|
||||
}
|
||||
if !self.finished {
|
||||
let _ = fs::remove_file(&self.path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,30 +1,24 @@
|
||||
//! An in-memory layer stores recently received PageVersions.
|
||||
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
|
||||
//! and layers can be spilled to disk into ephemeral files.
|
||||
//! An in-memory layer stores recently received key-value pairs.
|
||||
//!
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//! The "in-memory" part of the name is a bit misleading: the actual page versions are
|
||||
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
|
||||
//! its position in the file, is kept in memory, though.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::ZERO_PAGE;
|
||||
use crate::repository::ZenithWalRecord;
|
||||
use crate::layered_repository::utils;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use anyhow::Result;
|
||||
use log::*;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Seek;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
@@ -33,7 +27,6 @@ pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
@@ -42,7 +35,7 @@ pub struct InMemoryLayer {
|
||||
start_lsn: Lsn,
|
||||
|
||||
///
|
||||
/// LSN of the oldest page version stored in this layer.
|
||||
/// LSN of the oldest value stored in this layer.
|
||||
///
|
||||
/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
|
||||
/// of a layer always matches the 'end_lsn' of its predecessor, even if there
|
||||
@@ -59,9 +52,6 @@ pub struct InMemoryLayer {
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
@@ -69,98 +59,25 @@ pub struct InMemoryLayerInner {
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The PageVersion structs are stored in a serialized format in this file.
|
||||
/// Each serialized PageVersion is preceded by a 'u32' length field.
|
||||
/// 'page_versions' map stores offsets into this file.
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
|
||||
/// Metadata about all versions of all pages in the layer is kept
|
||||
/// here. Indexed by block number and LSN. The value is an offset
|
||||
/// into the ephemeral file where the page version is stored.
|
||||
page_versions: HashMap<SegmentBlk, VecMap<Lsn, u64>>,
|
||||
|
||||
///
|
||||
/// `seg_sizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'seg_sizes' is not used and is always empty.
|
||||
///
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
|
||||
///
|
||||
/// LSN of the newest page version stored in this layer.
|
||||
///
|
||||
/// The difference between 'end_lsn' and 'latest_lsn' is the same as between
|
||||
/// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
|
||||
///
|
||||
latest_lsn: Lsn,
|
||||
end_offset: u64,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let slice = self.seg_sizes.slice_range(..=lsn);
|
||||
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
*entry
|
||||
} else {
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Read a page version from the ephemeral file.
|
||||
///
|
||||
fn read_pv(&self, off: u64) -> Result<PageVersion> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_pv_bytes(off, &mut buf)?;
|
||||
Ok(PageVersion::des(&buf)?)
|
||||
}
|
||||
|
||||
///
|
||||
/// Read a page version from the ephemeral file, as raw bytes, at
|
||||
/// the given offset. The bytes are read into 'buf', which is
|
||||
/// expanded if necessary. Returns the size of the page version.
|
||||
///
|
||||
fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, off)?;
|
||||
let len = u32::from_ne_bytes(lenbuf) as usize;
|
||||
|
||||
if buf.len() < len {
|
||||
buf.resize(len, 0);
|
||||
}
|
||||
self.file.read_exact_at(&mut buf[0..len], off + 4)?;
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_pv(&mut self, pv: &PageVersion) -> Result<u64> {
|
||||
// remember starting position
|
||||
let pos = self.file.stream_position()?;
|
||||
|
||||
// make room for the 'length' field by writing zeros as a placeholder.
|
||||
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
|
||||
|
||||
pv.ser_into(&mut self.file).unwrap();
|
||||
|
||||
// write the 'length' field.
|
||||
let len = self.file.stream_position()? - pos - 4;
|
||||
let lenbuf = u32::to_ne_bytes(len as u32);
|
||||
self.file.write_all_at(&lenbuf, pos)?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
@@ -170,21 +87,12 @@ impl Layer for InMemoryLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
|
||||
drop_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
};
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
PathBuf::from(format!(
|
||||
"inmem-{:016X}-{:016X}",
|
||||
self.start_lsn.0, end_lsn.0
|
||||
))
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
@@ -195,132 +103,85 @@ impl Layer for InMemoryLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
};
|
||||
self.start_lsn..end_lsn
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.dropped
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
lsn_floor: Lsn,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
assert!(lsn_floor <= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.page_versions.get(&blknum) {
|
||||
let slice = vec_map.slice_range(..=lsn);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&reconstruct_state.key) {
|
||||
let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.read_pv(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some((*entry_lsn, img));
|
||||
let value = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
|
||||
reconstruct_state.lsn = *entry_lsn;
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init() {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum >= self.get_seg_size(lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know
|
||||
// caller know.
|
||||
if need_image {
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
reconstruct_state.lsn = Lsn(self.start_lsn.0 - 1);
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
|
||||
keys.extend(inner.index.keys().filter(|x| key_range.contains(x)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
// it doesn't exist in the layer. But we shouldn't
|
||||
// have requested it in the first place.
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if inner.dropped {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
panic!("dropped in-memory layer with no end LSN");
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, it exists
|
||||
Ok(true)
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
/// Cannot unload anything in an in-memory layer, since there's no backing
|
||||
@@ -337,7 +198,8 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
@@ -355,53 +217,39 @@ impl Layer for InMemoryLayer {
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
"----- in-memory layer for tli {} LSNs {}-{} ----",
|
||||
self.timelineid,
|
||||
self.start_lsn,
|
||||
end_str,
|
||||
//inner.dropped,
|
||||
);
|
||||
|
||||
for (k, v) in inner.seg_sizes.as_slice() {
|
||||
println!("seg_sizes {}: {}", k, v);
|
||||
}
|
||||
// FIXME
|
||||
/*
|
||||
for (blknum, versions) in page_versions {
|
||||
for (lsn, off) in versions.as_slice() {
|
||||
let pv = inner.read_pv(*off);
|
||||
let pv_description = match pv {
|
||||
Ok(PageVersion::Page(_img)) => "page",
|
||||
Ok(PageVersion::Wal(_rec)) => "wal",
|
||||
Err(_err) => "INVALID",
|
||||
};
|
||||
|
||||
// List the blocks in order
|
||||
let mut page_versions: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> =
|
||||
inner.page_versions.iter().collect();
|
||||
page_versions.sort_by_key(|k| k.0);
|
||||
|
||||
for (blknum, versions) in page_versions {
|
||||
for (lsn, off) in versions.as_slice() {
|
||||
let pv = inner.read_pv(*off);
|
||||
let pv_description = match pv {
|
||||
Ok(PageVersion::Page(_img)) => "page",
|
||||
Ok(PageVersion::Wal(_rec)) => "wal",
|
||||
Err(_err) => "INVALID",
|
||||
};
|
||||
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
}
|
||||
}
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A result of an inmemory layer data being written to disk.
|
||||
pub struct LayersOnDisk {
|
||||
pub delta_layers: Vec<DeltaLayer>,
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_lsn(&self) -> Lsn {
|
||||
self.oldest_lsn
|
||||
}
|
||||
|
||||
pub fn get_latest_lsn(&self) -> Lsn {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.latest_lsn
|
||||
}
|
||||
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
@@ -409,268 +257,88 @@ impl InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
|
||||
timelineid,
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'seg_sizes' with 0.
|
||||
let mut seg_sizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
seg_sizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
page_versions: HashMap::new(),
|
||||
seg_sizes,
|
||||
latest_lsn: oldest_lsn,
|
||||
end_offset: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
blknum: SegmentBlk,
|
||||
rec: ZenithWalRecord,
|
||||
) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Page(img))
|
||||
}
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result<u32> {
|
||||
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
|
||||
trace!(
|
||||
"put_page_version blk {} of {} at {}/{}",
|
||||
blknum,
|
||||
self.seg.rel,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
assert!(lsn >= inner.latest_lsn);
|
||||
inner.latest_lsn = lsn;
|
||||
|
||||
// Write the page version to the file, and remember its offset in 'page_versions'
|
||||
{
|
||||
let off = inner.write_pv(&pv)?;
|
||||
let vec_map = inner.page_versions.entry(blknum).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!(
|
||||
"Page version of rel {} blk {} at {} already exists",
|
||||
self.seg.rel, blknum, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Also update the relation size, if this extended the relation.
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum + 1;
|
||||
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
trace!(
|
||||
"enlarging segment {} from {} to {} blocks at {}",
|
||||
self.seg,
|
||||
oldsize,
|
||||
newsize,
|
||||
lsn
|
||||
);
|
||||
|
||||
// If we are extending the relation by more than one page, initialize the "gap"
|
||||
// with zeros
|
||||
//
|
||||
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
|
||||
// I don't think that can happen currently, but that is highly dependent on how
|
||||
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
|
||||
// happen, we would hit the "page version already exists" warning above on the
|
||||
// subsequent call to initialize the gap page.
|
||||
for gapblknum in oldsize..blknum {
|
||||
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
|
||||
// Write the page version to the file, and remember its offset in
|
||||
// 'page_versions'
|
||||
{
|
||||
let off = inner.write_pv(&zeropv)?;
|
||||
let vec_map = inner.page_versions.entry(gapblknum).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
warn!(
|
||||
"Page version of seg {} blk {} at {} already exists",
|
||||
self.seg, gapblknum, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return Ok(newsize - oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) {
|
||||
assert!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"put_truncation() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
let old_size = inner.get_seg_size(lsn);
|
||||
assert!(new_size < old_size);
|
||||
|
||||
let (old, _delta_size) = inner
|
||||
.seg_sizes
|
||||
.append_or_update_last(lsn, new_size)
|
||||
.unwrap();
|
||||
let off = inner.end_offset;
|
||||
let len = utils::write_blob(&mut inner.file, &Value::ser(&val)?)?;
|
||||
inner.end_offset += len;
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
}
|
||||
|
||||
/// Remember that the segment was dropped at given LSN
|
||||
pub fn drop_segment(&self, lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
assert!(inner.end_lsn.is_none());
|
||||
assert!(!inner.dropped);
|
||||
inner.dropped = true;
|
||||
assert!(self.start_lsn < lsn);
|
||||
inner.end_lsn = Some(lsn);
|
||||
|
||||
trace!("dropped segment {} at {}", self.seg, lsn);
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize a new InMemoryLayer for, by copying the state at the given
|
||||
/// point in time from given existing layer.
|
||||
///
|
||||
pub fn create_successor_layer(
|
||||
conf: &'static PageServerConf,
|
||||
src: Arc<dyn Layer>,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
|
||||
assert!(oldest_lsn.is_aligned());
|
||||
|
||||
trace!(
|
||||
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
timelineid,
|
||||
start_lsn,
|
||||
);
|
||||
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut seg_sizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
seg_sizes.append(start_lsn, size).unwrap();
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
incremental: true,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
file,
|
||||
page_versions: HashMap::new(),
|
||||
seg_sizes,
|
||||
latest_lsn: oldest_lsn,
|
||||
}),
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.end_lsn.is_none()
|
||||
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is inclusive
|
||||
/// `end_lsn` is exclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
if inner.end_lsn.is_some() {
|
||||
assert!(inner.dropped);
|
||||
} else {
|
||||
assert!(!inner.dropped);
|
||||
assert!(self.start_lsn < end_lsn + 1);
|
||||
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
inner.end_lsn = Some(end_lsn);
|
||||
|
||||
if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() {
|
||||
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
// FIXME
|
||||
/*
|
||||
for perseg in inner.segs.values() {
|
||||
if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() {
|
||||
assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
|
||||
for (_blk, vec_map) in inner.page_versions.iter() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn <= end_lsn);
|
||||
for (_blk, vec_map) in perseg.page_versions.iter() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/// Write the this frozen in-memory layer to disk.
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns new layers that replace this one.
|
||||
/// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
|
||||
@@ -678,17 +346,7 @@ impl InMemoryLayer {
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(
|
||||
&self,
|
||||
timeline: &LayeredTimeline,
|
||||
reconstruct_pages: bool,
|
||||
) -> Result<LayersOnDisk> {
|
||||
trace!(
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -700,105 +358,30 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN
|
||||
// that is included.
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
Key::MIN,
|
||||
self.start_lsn..inner.end_lsn.unwrap(),
|
||||
)?;
|
||||
|
||||
// Figure out if we should create a delta layer, image layer, or both.
|
||||
let image_lsn: Option<Lsn>;
|
||||
let delta_end_lsn: Option<Lsn>;
|
||||
if self.is_dropped() || !reconstruct_pages {
|
||||
// The segment was dropped. Create just a delta layer containing all the
|
||||
// changes up to and including the drop.
|
||||
delta_end_lsn = Some(end_lsn_exclusive);
|
||||
image_lsn = None;
|
||||
} else if self.start_lsn == end_lsn_inclusive {
|
||||
// The layer contains exactly one LSN. It's enough to write an image
|
||||
// layer at that LSN.
|
||||
delta_end_lsn = None;
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
} else {
|
||||
// Create a delta layer with all the changes up to the end LSN,
|
||||
// and an image layer at the end LSN.
|
||||
//
|
||||
// Note that we the delta layer does *not* include the page versions
|
||||
// at the end LSN. They are included in the image layer, and there's
|
||||
// no need to store them twice.
|
||||
delta_end_lsn = Some(end_lsn_inclusive);
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
}
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
|
||||
if let Some(delta_end_lsn) = delta_end_lsn {
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
delta_end_lsn,
|
||||
self.is_dropped(),
|
||||
)?;
|
||||
|
||||
// Write all page versions, in block + LSN order
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
||||
let pv_iter = inner.page_versions.iter();
|
||||
let mut pages: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> = pv_iter.collect();
|
||||
pages.sort_by_key(|(blknum, _vec_map)| *blknum);
|
||||
for (blknum, vec_map) in pages {
|
||||
let mut do_steps = || -> Result<()> {
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
if *lsn < delta_end_lsn {
|
||||
let len = inner.read_pv_bytes(*pos, &mut buf)?;
|
||||
delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?;
|
||||
}
|
||||
let val = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
|
||||
delta_layer_writer.put_value(*key, *lsn, val)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create seg_sizes
|
||||
let seg_sizes = if delta_end_lsn == end_lsn_exclusive {
|
||||
inner.seg_sizes.clone()
|
||||
} else {
|
||||
inner.seg_sizes.split_at(&end_lsn_exclusive).0
|
||||
};
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(seg_sizes)?;
|
||||
delta_layers.push(delta_layer);
|
||||
Ok(())
|
||||
};
|
||||
if let Err(err) = do_steps() {
|
||||
delta_layer_writer.abort();
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
if let Some(image_lsn) = image_lsn {
|
||||
let size = if self.seg.rel.is_blocky() {
|
||||
self.get_seg_size(image_lsn)?
|
||||
} else {
|
||||
1
|
||||
};
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
image_lsn,
|
||||
size,
|
||||
)?;
|
||||
|
||||
for blknum in 0..size {
|
||||
let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?;
|
||||
|
||||
image_layer_writer.put_page_image(&img)?;
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers,
|
||||
})
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
|
||||
Ok(delta_layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,468 +0,0 @@
|
||||
///
|
||||
/// IntervalTree is data structure for holding intervals. It is generic
|
||||
/// to make unit testing possible, but the only real user of it is the layer map,
|
||||
///
|
||||
/// It's inspired by the "segment tree" or a "statistic tree" as described in
|
||||
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
|
||||
/// the points instead of a binary tree. This is called an "interval tree" instead
|
||||
/// of "segment tree" because the term "segment" is already using Zenith to mean
|
||||
/// something else. To add to the confusion, there is another data structure known
|
||||
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
|
||||
/// for storing intervals, but this isn't that.
|
||||
///
|
||||
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
|
||||
/// there is a list of intervals that contain the point. The Points are formed
|
||||
/// from the start bounds of each interval; there is a Point for each distinct
|
||||
/// start bound.
|
||||
///
|
||||
/// Operations:
|
||||
///
|
||||
/// To find intervals that contain a given point, you search the b-tree to find
|
||||
/// the nearest Point <= search key. Then you just return the list of intervals.
|
||||
///
|
||||
/// To insert an interval, find the Point with start key equal to the inserted item.
|
||||
/// If the Point doesn't exist yet, create it, by copying all the items from the
|
||||
/// previous Point that cover the new Point. Then walk right, inserting the new
|
||||
/// interval to all the Points that are contained by the new interval (including the
|
||||
/// newly created Point).
|
||||
///
|
||||
/// To remove an interval, you scan the tree for all the Points that are contained by
|
||||
/// the removed interval, and remove it from the list in each Point.
|
||||
///
|
||||
/// Requirements and assumptions:
|
||||
///
|
||||
/// - Can store overlapping items
|
||||
/// - But there are not many overlapping items
|
||||
/// - The interval bounds don't change after it is added to the tree
|
||||
/// - Intervals are uniquely identified by pointer equality. You must not be insert the
|
||||
/// same interval object twice, and `remove` uses pointer equality to remove the right
|
||||
/// interval. It is OK to have two intervals with the same bounds, however.
|
||||
///
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct IntervalTree<I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
points: BTreeMap<I::Key, Point<I>>,
|
||||
}
|
||||
|
||||
struct Point<I: ?Sized> {
|
||||
/// All intervals that contain this point, in no particular order.
|
||||
///
|
||||
/// We assume that there aren't a lot of overlappingg intervals, so that this vector
|
||||
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
|
||||
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
|
||||
/// a linear search is OK.
|
||||
elements: Vec<Arc<I>>,
|
||||
}
|
||||
|
||||
/// Abstraction for an interval that can be stored in the tree
|
||||
///
|
||||
/// The start bound is inclusive and the end bound is exclusive. End must be greater
|
||||
/// than start.
|
||||
pub trait IntervalItem {
|
||||
type Key: Ord + Copy + Debug + Sized;
|
||||
|
||||
fn start_key(&self) -> Self::Key;
|
||||
fn end_key(&self) -> Self::Key;
|
||||
|
||||
fn bounds(&self) -> Range<Self::Key> {
|
||||
self.start_key()..self.end_key()
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
/// Return an element that contains 'key', or precedes it.
|
||||
///
|
||||
/// If there are multiple candidates, returns the one with the highest 'end' key.
|
||||
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
|
||||
// Find the greatest point that precedes or is equal to the search key. If there is
|
||||
// none, returns None.
|
||||
let (_, p) = self.points.range(..=key).next_back()?;
|
||||
|
||||
// Find the element with the highest end key at this point
|
||||
let highest_item = p
|
||||
.elements
|
||||
.iter()
|
||||
.reduce(|a, b| {
|
||||
// starting with Rust 1.53, could use `std::cmp::min_by_key` here
|
||||
if a.end_key() > b.end_key() {
|
||||
a
|
||||
} else {
|
||||
b
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
Some(Arc::clone(highest_item))
|
||||
}
|
||||
|
||||
/// Iterate over all items with start bound >= 'key'
|
||||
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(key..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterate over all items
|
||||
pub fn iter(&self) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, item: Arc<I>) {
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
assert!(start_key < end_key);
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
// Find the starting point and walk forward from there
|
||||
let mut found_start_point = false;
|
||||
let iter = self.points.range_mut(bounds);
|
||||
for (point_key, point) in iter {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
// It is an error to insert the same item to the tree twice.
|
||||
assert!(
|
||||
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
|
||||
"interval is already in the tree"
|
||||
);
|
||||
}
|
||||
point.elements.push(Arc::clone(&item));
|
||||
}
|
||||
if !found_start_point {
|
||||
// Create a new Point for the starting point
|
||||
|
||||
// Look at the previous point, and copy over elements that overlap with this
|
||||
// new point
|
||||
let mut new_elements: Vec<Arc<I>> = Vec::new();
|
||||
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
|
||||
let overlapping_prev_elements = prev_point
|
||||
.elements
|
||||
.iter()
|
||||
.filter(|x| x.bounds().contains(&start_key))
|
||||
.cloned();
|
||||
|
||||
new_elements.extend(overlapping_prev_elements);
|
||||
}
|
||||
new_elements.push(item);
|
||||
|
||||
let new_point = Point {
|
||||
elements: new_elements,
|
||||
};
|
||||
self.points.insert(start_key, new_point);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, item: &Arc<I>) {
|
||||
// range search points
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
let mut points_to_remove: Vec<I::Key> = Vec::new();
|
||||
let mut found_start_point = false;
|
||||
for (point_key, point) in self.points.range_mut(bounds) {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
}
|
||||
let len_before = point.elements.len();
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, item));
|
||||
let len_after = point.elements.len();
|
||||
assert_eq!(len_after + 1, len_before);
|
||||
if len_after == 0 {
|
||||
points_to_remove.push(*point_key);
|
||||
}
|
||||
}
|
||||
assert!(found_start_point);
|
||||
|
||||
for k in points_to_remove {
|
||||
self.points.remove(&k).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntervalIter<'a, I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
|
||||
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for IntervalIter<'a, I>
|
||||
where
|
||||
I: IntervalItem + ?Sized,
|
||||
{
|
||||
type Item = Arc<I>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// Iterate over all elements in all the points in 'point_iter'. To avoid
|
||||
// returning the same element twice, we only return each element at its
|
||||
// starting point.
|
||||
loop {
|
||||
// Return next remaining element from the current point
|
||||
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
|
||||
for elem in elem_iter {
|
||||
if elem.start_key() == *point_key {
|
||||
return Some(Arc::clone(elem));
|
||||
}
|
||||
}
|
||||
}
|
||||
// No more elements at this point. Move to next point.
|
||||
if let Some((point_key, point)) = self.point_iter.next() {
|
||||
self.elem_iter = Some((*point_key, point.elements.iter()));
|
||||
continue;
|
||||
} else {
|
||||
// No more points, all done
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> Default for IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
fn default() -> Self {
|
||||
IntervalTree {
|
||||
points: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MockItem {
|
||||
start_key: u32,
|
||||
end_key: u32,
|
||||
val: String,
|
||||
}
|
||||
impl IntervalItem for MockItem {
|
||||
type Key = u32;
|
||||
|
||||
fn start_key(&self) -> u32 {
|
||||
self.start_key
|
||||
}
|
||||
fn end_key(&self) -> u32 {
|
||||
self.end_key
|
||||
}
|
||||
}
|
||||
impl MockItem {
|
||||
fn new(start_key: u32, end_key: u32) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}", start_key, end_key),
|
||||
}
|
||||
}
|
||||
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}: {}", start_key, end_key, val),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl fmt::Display for MockItem {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.val)
|
||||
}
|
||||
}
|
||||
#[rustfmt::skip]
|
||||
fn assert_search(
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
expected: &[&str],
|
||||
) -> Option<Arc<MockItem>> {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
|
||||
assert!(
|
||||
expected.contains(&vstr.as_str()),
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected,
|
||||
);
|
||||
|
||||
Some(v)
|
||||
} else {
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
|
||||
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
|
||||
contents.sort();
|
||||
assert_eq!(contents, expected);
|
||||
}
|
||||
|
||||
fn dump_tree(tree: &IntervalTree<MockItem>) {
|
||||
for (point_key, point) in tree.points.iter() {
|
||||
print!("{}:", point_key);
|
||||
for e in point.elements.iter() {
|
||||
print!(" {}", e);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_simple() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Simple, non-overlapping ranges.
|
||||
tree.insert(Arc::new(MockItem::new(10, 11)));
|
||||
tree.insert(Arc::new(MockItem::new(11, 12)));
|
||||
tree.insert(Arc::new(MockItem::new(12, 13)));
|
||||
tree.insert(Arc::new(MockItem::new(18, 19)));
|
||||
tree.insert(Arc::new(MockItem::new(17, 18)));
|
||||
tree.insert(Arc::new(MockItem::new(15, 16)));
|
||||
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &["10-11"]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["12-13"]);
|
||||
assert_search(&tree, 13, &["12-13"]);
|
||||
assert_search(&tree, 14, &["12-13"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 16, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["18-19"]);
|
||||
assert_search(&tree, 19, &["18-19"]);
|
||||
assert_search(&tree, 20, &["18-19"]);
|
||||
|
||||
// remove a few entries and search around them again
|
||||
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
|
||||
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
|
||||
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &[]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["11-12"]);
|
||||
assert_search(&tree, 14, &["11-12"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["17-18"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_overlap() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Overlapping items
|
||||
tree.insert(Arc::new(MockItem::new(22, 24)));
|
||||
tree.insert(Arc::new(MockItem::new(23, 25)));
|
||||
let x24_26 = Arc::new(MockItem::new(24, 26));
|
||||
tree.insert(Arc::clone(&x24_26));
|
||||
let x26_28 = Arc::new(MockItem::new(26, 28));
|
||||
tree.insert(Arc::clone(&x26_28));
|
||||
tree.insert(Arc::new(MockItem::new(25, 27)));
|
||||
|
||||
assert_search(&tree, 22, &["22-24"]);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25", "24-26"]);
|
||||
assert_search(&tree, 25, &["24-26", "25-27"]);
|
||||
assert_search(&tree, 26, &["25-27", "26-28"]);
|
||||
assert_search(&tree, 27, &["26-28"]);
|
||||
assert_search(&tree, 28, &["26-28"]);
|
||||
assert_search(&tree, 29, &["26-28"]);
|
||||
|
||||
tree.remove(&x24_26);
|
||||
tree.remove(&x26_28);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25"]);
|
||||
assert_search(&tree, 25, &["25-27"]);
|
||||
assert_search(&tree, 26, &["25-27"]);
|
||||
assert_search(&tree, 27, &["25-27"]);
|
||||
assert_search(&tree, 28, &["25-27"]);
|
||||
assert_search(&tree, 29, &["25-27"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_nested() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Items containing other items
|
||||
tree.insert(Arc::new(MockItem::new(31, 39)));
|
||||
tree.insert(Arc::new(MockItem::new(32, 34)));
|
||||
tree.insert(Arc::new(MockItem::new(33, 35)));
|
||||
tree.insert(Arc::new(MockItem::new(30, 40)));
|
||||
|
||||
assert_search(&tree, 30, &["30-40"]);
|
||||
assert_search(&tree, 31, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
|
||||
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
|
||||
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
|
||||
assert_search(&tree, 35, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 36, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 37, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 38, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 39, &["30-40"]);
|
||||
assert_search(&tree, 40, &["30-40"]);
|
||||
assert_search(&tree, 41, &["30-40"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_duplicates() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Duplicate keys
|
||||
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
|
||||
tree.insert(Arc::clone(&item_a));
|
||||
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
|
||||
tree.insert(Arc::clone(&item_b));
|
||||
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
|
||||
tree.insert(Arc::clone(&item_c));
|
||||
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
|
||||
tree.insert(Arc::clone(&item_d));
|
||||
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
|
||||
tree.insert(Arc::clone(&item_e));
|
||||
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_search(
|
||||
&tree,
|
||||
55,
|
||||
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
|
||||
);
|
||||
tree.remove(&item_b);
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
|
||||
|
||||
tree.remove(&item_d);
|
||||
dump_tree(&tree);
|
||||
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_interval_tree_insert_twice() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Inserting the same item twice is not cool
|
||||
let item = Arc::new(MockItem::new(1, 2));
|
||||
tree.insert(Arc::clone(&item));
|
||||
tree.insert(Arc::clone(&item)); // fails assertion
|
||||
}
|
||||
}
|
||||
@@ -3,30 +3,26 @@
|
||||
//!
|
||||
//! When the timeline is first accessed, the server lists of all layer files
|
||||
//! in the timelines/<timelineid> directory, and populates this map with
|
||||
//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL
|
||||
//! is received, we create InMemoryLayers to hold the incoming records. Now and
|
||||
//! then, in the checkpoint() function, the in-memory layers are frozen, forming
|
||||
//! new image and delta layers and corresponding files are written to disk.
|
||||
//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
|
||||
//! new WAL record is received, we create an InMemoryLayer to hold the incoming
|
||||
//! records. Now and then, in the checkpoint() function, the in-memory layer is
|
||||
//! are frozen, and it is split up into new image and delta layers and the
|
||||
//! corresponding files are written to disk.
|
||||
//!
|
||||
|
||||
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::layered_repository::storage_layer::range_overlaps;
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::InMemoryLayer;
|
||||
use crate::relish::*;
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
.expect("failed to define a metric");
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric");
|
||||
@@ -37,108 +33,147 @@ lazy_static! {
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
/// All the layers keyed by segment tag
|
||||
segs: HashMap<SegmentTag, SegEntry>,
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
// where the start LSN of the next InMemoryLayer that is to be created.
|
||||
//
|
||||
pub open_layer: Option<Arc<InMemoryLayer>>,
|
||||
pub next_open_layer_at: Option<Lsn>,
|
||||
|
||||
/// All in-memory layers, ordered by 'oldest_lsn' and generation
|
||||
/// of each layer. This allows easy access to the in-memory layer that
|
||||
/// contains the oldest WAL record.
|
||||
open_layers: BinaryHeap<OpenLayerEntry>,
|
||||
///
|
||||
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
|
||||
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
|
||||
/// layer is during checkpointing, when an InMemoryLayer is being written out
|
||||
/// to disk.
|
||||
///
|
||||
pub frozen_layer: Option<Arc<InMemoryLayer>>,
|
||||
|
||||
/// Generation number, used to distinguish newly inserted entries in the
|
||||
/// binary heap from older entries during checkpoint.
|
||||
current_generation: u64,
|
||||
/// All the historic layers are kept here
|
||||
|
||||
/// TODO: This is a placeholder implementation of a data structure
|
||||
/// to hold information about all the layer files on disk and in
|
||||
/// S3. Currently, it's just a vector and all operations perform a
|
||||
/// linear scan over it. That obviously becomes slow as the
|
||||
/// number of layers grows. I'm imagining that an R-tree or some
|
||||
/// other 2D data structure would be the long-term solution here.
|
||||
historic_layers: Vec<Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<dyn Layer>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Look up a layer using the given segment tag and LSN. This differs from a
|
||||
/// plain key-value lookup in that if there is any layer that covers the
|
||||
/// given LSN, or precedes the given LSN, it is returned. In other words,
|
||||
/// you don't need to know the exact start LSN of the layer.
|
||||
///
|
||||
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
pub fn search(&self, key: Key, lsn: Lsn) -> Result<Option<SearchResult>> {
|
||||
// linear search
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
|
||||
segentry.get(lsn)
|
||||
}
|
||||
if img_lsn > lsn {
|
||||
// too new
|
||||
continue;
|
||||
}
|
||||
if img_lsn == lsn {
|
||||
// found exact match
|
||||
return Ok(Some(SearchResult {
|
||||
layer: Arc::clone(l),
|
||||
lsn_floor: lsn,
|
||||
}));
|
||||
}
|
||||
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
|
||||
latest_img = Some(Arc::clone(l));
|
||||
latest_img_lsn = Some(img_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get the open layer for given segment for writing. Or None if no open
|
||||
/// layer exists.
|
||||
///
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert an open in-memory layer
|
||||
///
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let oldest_lsn = layer.get_oldest_lsn();
|
||||
|
||||
// After a crash and restart, 'oldest_lsn' of the oldest in-memory
|
||||
// layer becomes the WAL streaming starting point, so it better not point
|
||||
// in the middle of a WAL record.
|
||||
assert!(oldest_lsn.is_aligned());
|
||||
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_lsn: layer.get_oldest_lsn(),
|
||||
layer_id,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
// Search the delta layers
|
||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
if l.get_lsn_range().start > lsn {
|
||||
// too new
|
||||
continue;
|
||||
}
|
||||
|
||||
if l.get_lsn_range().end > lsn {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
info!(
|
||||
"found layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
lsn
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
}
|
||||
// this layer's end LSN is smaller than the requested point. If there's
|
||||
// nothing newer, this is what we need to return. Remember this.
|
||||
if let Some(ref old_candidate) = latest_delta {
|
||||
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
} else {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
}
|
||||
if let Some(l) = latest_delta {
|
||||
info!(
|
||||
"found (old) layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
lsn
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap_or(l.get_lsn_range().start),
|
||||
layer: l,
|
||||
}))
|
||||
} else if let Some(l) = latest_img {
|
||||
info!("found img layer and no deltas for request on {} at {}", key, lsn);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
info!("no layer found for request on {} at {}", key, lsn);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn image_exists(&self, key_range: &Range<Key>, lsn: Lsn) -> bool {
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental()
|
||||
&& l.get_key_range() == *key_range
|
||||
&& l.get_lsn_range().start == lsn
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
segentry.insert_historic(layer);
|
||||
|
||||
self.historic_layers.push(layer);
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
@@ -147,61 +182,63 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
#[allow(dead_code)]
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
let len_before = self.historic_layers.len();
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.remove(&layer);
|
||||
}
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
self.historic_layers
|
||||
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||
|
||||
assert_eq!(self.historic_layers.len(), len_before - 1);
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
// List relations along with a flag that marks if they exist at the given lsn.
|
||||
// spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases.
|
||||
// Pass Tag if we're only interested in some relations.
|
||||
pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
|
||||
let mut rels: HashMap<RelishTag, bool> = HashMap::new();
|
||||
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
match seg.rel {
|
||||
RelishTag::Relation(reltag) => {
|
||||
if let Some(request_rel) = tag {
|
||||
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
|
||||
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
|
||||
{
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if tag == None {
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Is there a newer image layer for given segment?
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
/// We ignore segments newer than disk_consistent_lsn because they will be removed at restart
|
||||
/// We also only look at historic layers
|
||||
//#[allow(dead_code)]
|
||||
pub fn newer_image_layer_exists(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> bool {
|
||||
if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
|
||||
} else {
|
||||
false
|
||||
) -> Result<bool> {
|
||||
|
||||
let mut range_remain = key_range.clone();
|
||||
|
||||
loop {
|
||||
let mut made_progress = false;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
if !l.is_incremental() &&
|
||||
l.get_key_range().contains(&range_remain.start) &&
|
||||
img_lsn > lsn &&
|
||||
img_lsn < disk_consistent_lsn
|
||||
{
|
||||
made_progress = true;
|
||||
let img_key_end = l.get_key_range().end;
|
||||
|
||||
if img_key_end >= range_remain.end {
|
||||
return Ok(true);
|
||||
}
|
||||
range_remain.start = img_key_end;
|
||||
}
|
||||
}
|
||||
|
||||
if !made_progress {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,284 +248,139 @@ impl LayerMap {
|
||||
/// used for garbage collection, to determine if some alive layer
|
||||
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
|
||||
/// to avoid incorrectly making it visible.
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
/*
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
|
||||
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
}
|
||||
*/
|
||||
|
||||
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
// Find the last image layer that covers the key
|
||||
let mut candidate_lsn = Lsn(0);
|
||||
let mut candidate = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let this_lsn = l.get_lsn_range().start;
|
||||
if this_lsn > lsn {
|
||||
continue;
|
||||
}
|
||||
if this_lsn < candidate_lsn {
|
||||
// our previous candidate was better
|
||||
continue;
|
||||
}
|
||||
candidate_lsn = this_lsn;
|
||||
candidate = Some(Arc::clone(l));
|
||||
}
|
||||
|
||||
candidate
|
||||
}
|
||||
|
||||
///
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range. (This is used when creating new
|
||||
/// image layers)
|
||||
///
|
||||
// FIXME: clippy complains that the result type is very complex. She's probably
|
||||
// right...
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn image_coverage(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
let mut points: Vec<Key>;
|
||||
|
||||
points = vec![key_range.start];
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.get_lsn_range().start > lsn {
|
||||
continue;
|
||||
}
|
||||
let range = l.get_key_range();
|
||||
if key_range.contains(&range.start) {
|
||||
points.push(l.get_key_range().start);
|
||||
}
|
||||
if key_range.contains(&range.end) {
|
||||
points.push(l.get_key_range().end);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
points.push(key_range.end);
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
/// added with `insert_open` after this call will be associated with the new
|
||||
/// generation. Returns the new generation number.
|
||||
pub fn increment_generation(&mut self) -> u64 {
|
||||
self.current_generation += 1;
|
||||
self.current_generation
|
||||
}
|
||||
points.sort();
|
||||
points.dedup();
|
||||
|
||||
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
|
||||
HistoricLayerIter {
|
||||
seg_iter: self.segs.iter(),
|
||||
iter: None,
|
||||
// Ok, we now have a list of "interesting" points in the key space
|
||||
|
||||
// For each range between the points, find the latest image
|
||||
let mut start = *points.first().unwrap();
|
||||
let mut ranges = Vec::new();
|
||||
for end in points[1..].iter() {
|
||||
let img = self.find_latest_image(start, lsn);
|
||||
|
||||
ranges.push((start..*end, img));
|
||||
|
||||
start = *end;
|
||||
}
|
||||
Ok(ranges)
|
||||
}
|
||||
|
||||
pub fn get_deltas(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
let mut deltas = Vec::new();
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_key_range(), key_range) {
|
||||
continue;
|
||||
}
|
||||
deltas.push(Arc::clone(l));
|
||||
}
|
||||
Ok(deltas)
|
||||
}
|
||||
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
let mut deltas = Vec::new();
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if l.get_key_range() != (Key::MIN..Key::MAX) {
|
||||
continue;
|
||||
}
|
||||
deltas.push(Arc::clone(l));
|
||||
}
|
||||
Ok(deltas)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
for layer in self.historic_layers.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl IntervalItem for dyn Layer {
|
||||
type Key = Lsn;
|
||||
|
||||
fn start_key(&self) -> Lsn {
|
||||
self.get_start_lsn()
|
||||
}
|
||||
fn end_key(&self) -> Lsn {
|
||||
self.get_end_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
|
||||
/// associated with the segment.
|
||||
///
|
||||
/// The last layer that is open for writes is always an InMemoryLayer,
|
||||
/// and is kept in a separate field, because there can be only one for
|
||||
/// each segment. The older layers, stored on disk, are kept in an
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open_layer_id: Option<LayerId>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
impl SegEntry {
|
||||
/// Does the segment exist at given LSN?
|
||||
/// Return None if object is not found in this SegEntry.
|
||||
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
|
||||
if let Some(layer) = self.get(lsn) {
|
||||
Ok(Some(layer.get_seg_exists(lsn)?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
}
|
||||
}
|
||||
|
||||
self.historic.search(lsn)
|
||||
}
|
||||
|
||||
pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
|
||||
// We only check on-disk layers, because
|
||||
// in-memory layers are not durable
|
||||
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
self.historic
|
||||
.iter_newer(lsn)
|
||||
.any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
|
||||
}
|
||||
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
}
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
self.historic.insert(layer);
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
|
||||
/// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
|
||||
///
|
||||
/// The generation number associated with each entry can be used to distinguish
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that. Entries with identical oldest_lsn are ordered by generation
|
||||
other
|
||||
.oldest_lsn
|
||||
.cmp(&self.oldest_lsn)
|
||||
.then_with(|| other.generation.cmp(&self.generation))
|
||||
}
|
||||
}
|
||||
impl PartialOrd for OpenLayerEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for OpenLayerEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for OpenLayerEntry {}
|
||||
|
||||
/// Iterator returned by LayerMap::iter_historic_layers()
|
||||
pub struct HistoricLayerIter<'a> {
|
||||
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
|
||||
iter: Option<IntervalIter<'a, dyn Layer>>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
type Item = Arc<dyn Layer>;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
|
||||
loop {
|
||||
if let Some(x) = &mut self.iter {
|
||||
if let Some(x) = x.next() {
|
||||
return Some(Arc::clone(&x));
|
||||
}
|
||||
}
|
||||
if let Some((_tag, segentry)) = self.seg_iter.next() {
|
||||
self.iter = Some(segentry.historic.iter());
|
||||
continue;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::config::PageServerConf;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref DUMMY_TIMELINEID: ZTimelineId =
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
|
||||
static ref DUMMY_TENANTID: ZTenantId =
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
|
||||
}
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
segno: u32,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Arc<InMemoryLayer> {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
*DUMMY_TIMELINEID,
|
||||
*DUMMY_TENANTID,
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
},
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
let gen1 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
|
||||
|
||||
let gen2 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
|
||||
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.remove_open(layer_id);
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
assert_pop_layer(5, gen2); // 0x100
|
||||
assert_pop_layer(3, gen1); // 0x110
|
||||
assert_pop_layer(4, gen2); // 0x110
|
||||
assert_pop_layer(2, gen1); // 0x120
|
||||
assert_pop_layer(1, gen1); // 0x200
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,75 +2,34 @@
|
||||
//! Common traits and structs for layers
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::{BlockNumber, ZenithWalRecord};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Size of one segment in pages (10 MB)
|
||||
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
// in # of key-value pairs
|
||||
// FIXME Size of one segment in pages (128 MB)
|
||||
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024;
|
||||
pub const TARGET_FILE_SIZE: u32 = (TARGET_FILE_SIZE_BYTES / 8192) as u32;
|
||||
|
||||
///
|
||||
/// Each relish stored in the repository is divided into fixed-sized "segments",
|
||||
/// with 10 MB of key-space, or 1280 8k pages each.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct SegmentTag {
|
||||
pub rel: RelishTag,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
/// SegmentBlk represents a block number within a segment, or the size of segment.
|
||||
///
|
||||
/// This is separate from BlockNumber, which is used for block number within the
|
||||
/// whole relish. Since this is just a type alias, the compiler will let you mix
|
||||
/// them freely, but we use the type alias as documentation to make it clear
|
||||
/// which one we're dealing with.
|
||||
///
|
||||
/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally
|
||||
/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes
|
||||
/// operations more verbose).
|
||||
pub type SegmentBlk = u32;
|
||||
|
||||
impl fmt::Display for SegmentTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}.{}", self.rel, self.segno)
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
{
|
||||
if a.start < b.start {
|
||||
a.end > b.start
|
||||
} else {
|
||||
b.end > a.start
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentTag {
|
||||
/// Given a relish and block number, calculate the corresponding segment and
|
||||
/// block number within the segment.
|
||||
pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) {
|
||||
(
|
||||
SegmentTag {
|
||||
rel,
|
||||
segno: blknum / RELISH_SEG_SIZE,
|
||||
},
|
||||
blknum % RELISH_SEG_SIZE,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents a version of a page at a specific LSN. The LSN is the key of the
|
||||
/// entry in the 'page_versions' hash, it is not duplicated here.
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
Page(Bytes),
|
||||
Wal(ZenithWalRecord),
|
||||
}
|
||||
|
||||
///
|
||||
/// FIXME
|
||||
/// Struct used to communicate across calls to 'get_page_reconstruct_data'.
|
||||
///
|
||||
/// Before first call to get_page_reconstruct_data, you can fill in 'page_img'
|
||||
@@ -88,25 +47,32 @@ pub enum PageVersion {
|
||||
/// the same PageReconstructData struct in the next 'get_page_reconstruct_data'
|
||||
/// call, to collect more records.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
#[derive(Debug)]
|
||||
pub struct ValueReconstructState {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
pub records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
pub page_img: Option<(Lsn, Bytes)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
|
||||
pub request_lsn: Lsn, // original request's LSN, for debugging purposes
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
#[derive(Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
Continue,
|
||||
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
Missing,
|
||||
}
|
||||
|
||||
///
|
||||
/// FIXME
|
||||
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access
|
||||
@@ -120,21 +86,17 @@ pub trait Layer: Send + Sync {
|
||||
/// Identify the timeline this relish belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
|
||||
/// Identify the relish segment
|
||||
fn get_seg_tag(&self) -> SegmentTag;
|
||||
/// Range of segments that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
/// FIXME
|
||||
/// Inclusive start bound of the LSN range that this layer holds
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
/// Exclusive end bound of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, this is MAX_LSN.
|
||||
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_end_lsn(&self) -> Lsn;
|
||||
|
||||
/// Is the segment represented by this layer dropped by PostgreSQL?
|
||||
fn is_dropped(&self) -> bool;
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
@@ -153,18 +115,11 @@ pub trait Layer: Send + Sync {
|
||||
/// is available. If this returns PageReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
/// Return size of the segment at given LSN. (Only for blocky relations.)
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk>;
|
||||
|
||||
/// Does the segment exist at given LSN? Or was it dropped before it.
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||
lsn_floor: Lsn,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Does this layer only contain some data for the segment (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
@@ -175,6 +130,11 @@ pub trait Layer: Send + Sync {
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
|
||||
|
||||
/// Return a set of all distinct Keys present in this layer
|
||||
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()>;
|
||||
|
||||
/// Release memory used by this layer. There is no corresponding 'load'
|
||||
/// function, that's done implicitly when you call one of the get-functions.
|
||||
fn unload(&self) -> Result<()>;
|
||||
|
||||
48
pageserver/src/layered_repository/utils.rs
Normal file
48
pageserver/src/layered_repository/utils.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
// Utilities for reading and writing Values
|
||||
use std::io::{Error, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
use bookfile::BoundedReader;
|
||||
|
||||
pub fn read_blob<F: FileExt>(file: &F, off: u64) -> Result<Vec<u8>, Error> {
|
||||
// read length
|
||||
let mut len_buf = [0u8; 4];
|
||||
file.read_exact_at(&mut len_buf, off)?;
|
||||
|
||||
let len = u32::from_ne_bytes(len_buf);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
buf.resize(len as usize, 0);
|
||||
file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?;
|
||||
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn read_blob_from_chapter<F: FileExt>(
|
||||
file: &BoundedReader<&F>,
|
||||
off: u64,
|
||||
) -> Result<Vec<u8>, Error> {
|
||||
// read length
|
||||
let mut len_buf = [0u8; 4];
|
||||
file.read_exact_at(&mut len_buf, off)?;
|
||||
|
||||
let len = u32::from_ne_bytes(len_buf);
|
||||
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
buf.resize(len as usize, 0);
|
||||
file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?;
|
||||
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn write_blob<W: Write>(writer: &mut W, buf: &[u8]) -> Result<u64, Error> {
|
||||
let val_len = buf.len() as u32;
|
||||
|
||||
// write the 'length' field and kind byte.
|
||||
let lenbuf = u32::to_ne_bytes(val_len);
|
||||
|
||||
writer.write_all(&lenbuf)?;
|
||||
writer.write_all(buf)?;
|
||||
|
||||
Ok(4 + val_len as u64)
|
||||
}
|
||||
@@ -6,6 +6,7 @@ pub mod import_datadir;
|
||||
pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod relish;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
@@ -22,7 +23,8 @@ use lazy_static::lazy_static;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use layered_repository::{LayeredRepository, LayeredTimeline};
|
||||
use layered_repository::LayeredRepository;
|
||||
use pgdatadir_mapping::DatadirTimeline;
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
@@ -47,5 +49,5 @@ pub enum CheckpointConfig {
|
||||
}
|
||||
|
||||
pub type RepositoryImpl = LayeredRepository;
|
||||
pub type TimelineImpl = LayeredTimeline;
|
||||
|
||||
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
|
||||
|
||||
@@ -32,8 +32,10 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::relish::*;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -395,8 +397,8 @@ impl PageServerHandler {
|
||||
/// In either case, if the page server hasn't received the WAL up to the
|
||||
/// requested LSN yet, we will wait for it to arrive. The return value is
|
||||
/// the LSN that should be used to look up the page versions.
|
||||
fn wait_or_get_last_lsn<T: Timeline>(
|
||||
timeline: &T,
|
||||
fn wait_or_get_last_lsn<R: Repository>(
|
||||
timeline: &DatadirTimeline<R>,
|
||||
mut lsn: Lsn,
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
|
||||
@@ -423,7 +425,7 @@ impl PageServerHandler {
|
||||
if lsn <= last_record_lsn {
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline.wait_lsn(lsn)?;
|
||||
timeline.tline.wait_lsn(lsn)?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
@@ -433,7 +435,7 @@ impl PageServerHandler {
|
||||
if lsn == Lsn(0) {
|
||||
bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn)?;
|
||||
timeline.tline.wait_lsn(lsn)?;
|
||||
}
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
@@ -443,54 +445,47 @@ impl PageServerHandler {
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
fn handle_get_rel_exists_request<T: Timeline>(
|
||||
fn handle_get_rel_exists_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let exists = timeline.get_rel_exists(tag, lsn)?;
|
||||
let exists = timeline.get_rel_exists(req.rel, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_nblocks_request<T: Timeline>(
|
||||
fn handle_get_nblocks_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let n_blocks = timeline.get_relish_size(tag, lsn)?;
|
||||
|
||||
// Return 0 if relation is not found.
|
||||
// This is what postgres smgr expects.
|
||||
let n_blocks = n_blocks.unwrap_or(0);
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request<T: Timeline>(
|
||||
fn handle_get_page_at_lsn_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
.entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delayed causes the requests to
|
||||
@@ -500,7 +495,7 @@ impl PageServerHandler {
|
||||
std::thread::sleep(std::time::Duration::from_millis(1000));
|
||||
}
|
||||
*/
|
||||
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
|
||||
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -520,7 +515,7 @@ impl PageServerHandler {
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Cannot handle basebackup request for a remote timeline")?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
@@ -699,67 +694,19 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layers_not_updated"),
|
||||
RowDescriptor::int8_col(b"layers_removed"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.layers_total.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_branches.to_string().as_bytes()),
|
||||
Some(result.layers_not_updated.to_string().as_bytes()),
|
||||
Some(result.layers_removed.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -779,7 +726,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
|
||||
.context("Failed to fetch local timeline for checkpoint request")?;
|
||||
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
|
||||
1173
pageserver/src/pgdatadir_mapping.rs
Normal file
1173
pageserver/src/pgdatadir_mapping.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -641,7 +641,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
harness: &RepoHarness<'_>,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
|
||||
@@ -881,7 +881,7 @@ mod test_utils {
|
||||
|
||||
#[track_caller]
|
||||
pub async fn ensure_correct_timeline_upload(
|
||||
harness: &RepoHarness,
|
||||
harness: &RepoHarness<'_>,
|
||||
remote_assets: Arc<(LocalFs, RwLock<RemoteTimelineIndex>)>,
|
||||
timeline_id: ZTimelineId,
|
||||
new_upload: NewCheckpoint,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,14 +2,15 @@
|
||||
//! page server.
|
||||
|
||||
use crate::branches;
|
||||
use crate::{RepositoryImpl, TimelineImpl};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::{Repository, TimelineSyncState};
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::TimelineSyncState;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{DatadirTimelineImpl, RepositoryImpl};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
@@ -26,6 +27,8 @@ lazy_static! {
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
repo: Arc<RepositoryImpl>,
|
||||
|
||||
timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -79,15 +82,17 @@ pub fn set_timeline_states(
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<RepositoryImpl> = Arc::new(LayeredRepository::new(
|
||||
let repo = LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
);
|
||||
|
||||
Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
repo: Arc::new(repo),
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
});
|
||||
if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
|
||||
@@ -191,6 +196,7 @@ pub fn create_repository_for_tenant(
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
timelines: HashMap::new(),
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -261,11 +267,25 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryIm
|
||||
pub fn get_timeline_for_tenant(
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<TimelineImpl>> {
|
||||
get_repository_for_tenant(tenantid)?
|
||||
) -> Result<Arc<DatadirTimelineImpl>> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m
|
||||
.get_mut(&tenantid)
|
||||
.with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
|
||||
|
||||
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
|
||||
return Ok(Arc::clone(page_tline));
|
||||
}
|
||||
// First access to this timeline. Create a DatadirTimeline wrapper for it
|
||||
let tline = tenant
|
||||
.repo
|
||||
.get_timeline(timelineid)?
|
||||
.local_timeline()
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))?;
|
||||
|
||||
let page_tline = Arc::new(DatadirTimelineImpl::new(tline));
|
||||
tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
|
||||
Ok(page_tline)
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
|
||||
@@ -14,6 +14,15 @@ use zenith_utils::zid::ZTenantId;
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
if let Err(err) = checkpoint_loop_ext(tenantid, conf) {
|
||||
error!("checkpoint loop terminated with error: {:?}", err);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,6 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -255,8 +254,7 @@ fn walreceiver_main(
|
||||
// at risk of hitting a deadlock.
|
||||
assert!(lsn.is_aligned());
|
||||
|
||||
let writer = timeline.writer();
|
||||
walingest.ingest_record(&*timeline, writer.as_ref(), recdata, lsn)?;
|
||||
walingest.ingest_record(&timeline, recdata, lsn)?;
|
||||
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
|
||||
@@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
|
||||
use crate::repository::ZenithWalRecord;
|
||||
/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum ZenithWalRecord {
|
||||
/// Native PostgreSQL WAL record
|
||||
Postgres { will_init: bool, rec: Bytes },
|
||||
|
||||
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
|
||||
ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
ClogSetCommitted { xids: Vec<TransactionId> },
|
||||
/// Mark transaction IDs as aborted on a CLOG page
|
||||
ClogSetAborted { xids: Vec<TransactionId> },
|
||||
/// Extend multixact offsets SLRU
|
||||
MultixactOffsetCreate {
|
||||
mid: MultiXactId,
|
||||
moff: MultiXactOffset,
|
||||
},
|
||||
/// Extend multixact members SLRU.
|
||||
MultixactMembersCreate {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
}
|
||||
|
||||
impl ZenithWalRecord {
|
||||
/// Does replaying this WAL record initialize the page from scratch, or does
|
||||
/// it need to be applied over the previous image of the page?
|
||||
pub fn will_init(&self) -> bool {
|
||||
match self {
|
||||
ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
|
||||
// None of the special zenith record types currently initialize the page
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// DecodedBkpBlock represents per-page data contained in a WAL record.
|
||||
#[derive(Default)]
|
||||
@@ -87,6 +127,28 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrCreate {
|
||||
pub rnode: RelFileNode,
|
||||
// FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
|
||||
// well-defined size?
|
||||
pub forknum: u8,
|
||||
}
|
||||
|
||||
impl XlSmgrCreate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
|
||||
XlSmgrCreate {
|
||||
rnode: RelFileNode {
|
||||
spcnode: buf.get_u32_le(), /* tablespace */
|
||||
dbnode: buf.get_u32_le(), /* database */
|
||||
relnode: buf.get_u32_le(), /* relation */
|
||||
},
|
||||
forknum: buf.get_u32_le() as u8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
|
||||
@@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::relish::*;
|
||||
use crate::repository::ZenithWalRecord;
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
|
||||
@@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -92,8 +93,7 @@ pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_rel: RelishTag,
|
||||
_blknum: u32,
|
||||
_key: Key,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool {
|
||||
if let RelishTag::Relation(RelTag {
|
||||
forknum,
|
||||
spcnode: _,
|
||||
dbnode: _,
|
||||
relnode: _,
|
||||
}) = rel
|
||||
{
|
||||
*forknum == expected_forknum
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool {
|
||||
if let RelishTag::Slru { slru, segno } = rel {
|
||||
*slru == expected_slru && *segno == expected_segno
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// An error happened in WAL redo
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum WalRedoError {
|
||||
@@ -184,6 +162,8 @@ pub enum WalRedoError {
|
||||
InvalidState,
|
||||
#[error("cannot perform WAL redo for this request")]
|
||||
InvalidRequest,
|
||||
#[error("cannot perform WAL redo for this record")]
|
||||
InvalidRecord,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
|
||||
if rec_zenith != batch_zenith {
|
||||
let result = if batch_zenith {
|
||||
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
|
||||
self.apply_batch_zenith(key, lsn, img, &records[batch_start..i])
|
||||
} else {
|
||||
self.apply_batch_postgres(
|
||||
rel,
|
||||
blknum,
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
&records[batch_start..i],
|
||||
@@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
}
|
||||
// last batch
|
||||
if batch_zenith {
|
||||
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
|
||||
self.apply_batch_zenith(key, lsn, img, &records[batch_start..])
|
||||
} else {
|
||||
self.apply_batch_postgres(
|
||||
rel,
|
||||
blknum,
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
&records[batch_start..],
|
||||
@@ -268,16 +245,15 @@ impl PostgresRedoManager {
|
||||
///
|
||||
fn apply_batch_postgres(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, ZenithWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let start_time = Instant::now();
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
@@ -291,16 +267,11 @@ impl PostgresRedoManager {
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
|
||||
let result = if let RelishTag::Relation(rel) = rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);
|
||||
|
||||
apply_result.map_err(WalRedoError::IoError)
|
||||
} else {
|
||||
error!("unexpected non-relation relish: {:?}", rel);
|
||||
Err(WalRedoError::InvalidRequest)
|
||||
};
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
let result = process
|
||||
.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout)
|
||||
.map_err(WalRedoError::IoError);
|
||||
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
@@ -326,8 +297,7 @@ impl PostgresRedoManager {
|
||||
///
|
||||
fn apply_batch_zenith(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, ZenithWalRecord)],
|
||||
@@ -346,7 +316,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// Apply all the WAL records in the batch
|
||||
for (record_lsn, record) in records.iter() {
|
||||
self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?;
|
||||
self.apply_record_zenith(key, &mut page, *record_lsn, record)?;
|
||||
}
|
||||
// Success!
|
||||
let end_time = Instant::now();
|
||||
@@ -365,8 +335,7 @@ impl PostgresRedoManager {
|
||||
|
||||
fn apply_record_zenith(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
page: &mut BytesMut,
|
||||
_record_lsn: Lsn,
|
||||
record: &ZenithWalRecord,
|
||||
@@ -382,9 +351,10 @@ impl PostgresRedoManager {
|
||||
flags,
|
||||
} => {
|
||||
// sanity check that this is modifying the correct relish
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert!(
|
||||
check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
"ClearVisibilityMapFlags record on unexpected rel {:?}",
|
||||
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
rel
|
||||
);
|
||||
if let Some(heap_blkno) = *new_heap_blkno {
|
||||
@@ -418,6 +388,14 @@ impl PostgresRedoManager {
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
ZenithWalRecord::ClogSetCommitted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetCommitted record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -425,12 +403,17 @@ impl PostgresRedoManager {
|
||||
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
|
||||
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
transaction_id_set_status(
|
||||
xid,
|
||||
@@ -440,6 +423,14 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
ZenithWalRecord::ClogSetAborted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetAborted record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -447,17 +438,30 @@ impl PostgresRedoManager {
|
||||
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
|
||||
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
|
||||
}
|
||||
}
|
||||
ZenithWalRecord::MultixactOffsetCreate { mid, moff } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactOffsets,
|
||||
"MultixactOffsetCreate record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
// Compute the block and offset to modify.
|
||||
// See RecordNewMultiXact in PostgreSQL sources.
|
||||
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
@@ -468,16 +472,29 @@ impl PostgresRedoManager {
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno),
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
|
||||
}
|
||||
ZenithWalRecord::MultixactMembersCreate { moff, members } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactMembers,
|
||||
"MultixactMembersCreate record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for (i, member) in members.iter().enumerate() {
|
||||
let offset = moff + i as u32;
|
||||
|
||||
@@ -492,12 +509,17 @@ impl PostgresRedoManager {
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno),
|
||||
"MultiXactMembersCreate record at offset {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
|
||||
|
||||
@@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const XLOG_SMGR_CREATE: u8 = 0x10;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
@@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
|
||||
pub const XLOG_FPI: u8 = 0xB0;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
|
||||
@@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int:
|
||||
def print_gc_result(row):
|
||||
log.info("GC duration {elapsed} ms".format_map(row))
|
||||
log.info(
|
||||
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
|
||||
.format_map(row))
|
||||
log.info(
|
||||
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
|
||||
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
|
||||
.format_map(row))
|
||||
|
||||
Reference in New Issue
Block a user