Major storage format rewrite

Major changes and new concepts:

Simplify Repository to a value-store
------------------------------------

Move the responsibility of tracking relation metadata, like which relations
exist and what their sizes are, from Repository to a new module,
pgdatadir_mapping.rs. The interface to Repository is now simple key-value
PUT/GET operations.

It's still not just any old key-value store, though. A Repository is still
responsible for handling branching, and every GET operation comes with
an LSN.
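
To make that interface concrete, here is a rough sketch of what a PUT/GET
style Timeline API looks like. The names and signatures below are
illustrative only, not the exact trait; the real definitions live in
repository.rs:

    // Sketch only: simplified, hypothetical signatures.
    use anyhow::Result;
    use bytes::Bytes;
    use zenith_utils::lsn::Lsn;

    pub struct Key; // see the next section for the layout

    pub enum Value {
        Image(Bytes),     // a full page image
        WalRecord(Bytes), // a WAL record to apply on top of an older image
    }

    pub trait Timeline {
        /// GET: reconstruct the value of 'key' as it was at 'lsn',
        /// following the ancestor timeline if this branch was created later.
        fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes>;

        /// PUT: store a new value (page image or WAL record) for 'key' at 'lsn'.
        fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>;
    }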

Key
---

The key to the Repository key-value store is a Key struct, which consists
of a few integer fields. It's wide enough to store a full RelFileNode,
fork and block number, and to distinguish those from metadata keys.

See pgdatadir_mapping.rs for how relation blocks and metadata keys are
mapped to the Key struct.
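
As an illustration, a Key along those lines could look like the sketch below.
The field names, the 0x00 tag value and the helper function are assumptions
made for this example; the authoritative layout is in repository.rs and
pgdatadir_mapping.rs:

    // Illustrative sketch, not the exact definition.
    pub struct Key {
        pub field1: u8,  // distinguishes relation blocks from metadata keys
        pub field2: u32, // spcnode (tablespace OID)
        pub field3: u32, // dbnode (database OID)
        pub field4: u32, // relnode (relation OID from the RelFileNode)
        pub field5: u8,  // fork number
        pub field6: u32, // block number
    }

    // Hypothetical helper mapping one relation block to a Key.
    pub fn rel_block_to_key(spcnode: u32, dbnode: u32, relnode: u32, forknum: u8, blknum: u32) -> Key {
        Key {
            field1: 0x00, // 0x00 chosen here to mean "relation data"; metadata keys would use other tags
            field2: spcnode,
            field3: dbnode,
            field4: relnode,
            field5: forknum,
            field6: blknum,
        }
    }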

Store arbitrary key-ranges in the layer files
---------------------------------------------

The concept of a "segment" is gone. Each layer file can store an arbitrary
range of Keys.
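
In terms of the Layer interface (see the get_key_range/get_lsn_range methods
in the delta_layer.rs diff below), each layer is now described by a key range
and an LSN range rather than a SegmentTag. A trimmed-down sketch, not the
full trait:

    use std::ops::Range;
    use zenith_utils::lsn::Lsn;

    pub struct Key; // placeholder; see the sketch in the previous section

    pub trait Layer {
        /// The range of Keys covered by this layer file.
        fn get_key_range(&self) -> Range<Key>;
        /// The range of LSNs covered by this layer file.
        fn get_lsn_range(&self) -> Range<Lsn>;
    }

The key and LSN ranges are also what the new layer file names encode, e.g.
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051.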

TODO:

- Deleting keys, to reclaim space. This isn't visible to Postgres: dropping
  or truncating a relation works as you would expect when viewed from the
  compute node. If you drop a relation, for example, it is removed from the
  metadata entry, so it appears to be gone. However, the layered repository
  implementation never reclaims the storage.

- Tracking "logical database size", for disk space quotas. That ought to
  be reimplemented now in pgdatadir_mapping.rs, or perhaps in walingest.rs.

- LSM compaction. The logic for checkpointing and creating image layers is
  very dumb. AFAIK the *read* code could now deal with a full-fledged LSM
  tree consisting of the delta and image layers, but there's no code to
  take a bunch of delta layers and compact them, and the heuristics for
  when to create image layers are pretty dumb.

- The code to track the layers is inefficient. All layers are just stored in
  a vector, and whenever we need to find a layer, we do a linear search in
  it (see the sketch below).
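
To spell out the naive lookup described in the last item above, here is a
sketch with hypothetical stand-in types and names; the real LayerMap code
differs in detail:

    use std::ops::Range;
    use std::sync::Arc;

    trait Layer {
        fn key_range(&self) -> Range<u64>; // stand-in for Range<Key>
        fn lsn_range(&self) -> Range<u64>; // stand-in for Range<Lsn>
    }

    struct LayerMap {
        layers: Vec<Arc<dyn Layer>>, // flat list of all layers on the timeline
    }

    impl LayerMap {
        /// Linear search: cost grows with the number of layers.
        fn search(&self, key: u64, lsn: u64) -> Option<Arc<dyn Layer>> {
            self.layers
                .iter()
                .find(|l| l.key_range().contains(&key) && l.lsn_range().contains(&lsn))
                .cloned()
        }
    }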

commit 6127b6638b
parent c7c1e19667
Author: Heikki Linnakangas
Date:   2022-03-07 14:04:46 +02:00

30 changed files with 4483 additions and 4828 deletions

Cargo.lock (generated)

@@ -1307,6 +1307,7 @@ dependencies = [
"hex-literal",
"humantime",
"hyper",
"itertools",
"lazy_static",
"log",
"nix",


@@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] }
byteorder = "1.4.3"
futures = "0.3.13"
hyper = "0.14"
itertools = "0.10.3"
lazy_static = "1.4.0"
log = "0.4.14"
clap = "3.0"


@@ -22,6 +22,7 @@ use tar::{Builder, EntryType, Header};
use crate::relish::*;
use crate::repository::Timeline;
use crate::DatadirTimelineImpl;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
@@ -29,9 +30,9 @@ use zenith_utils::lsn::Lsn;
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// used for constructing tarball.
pub struct Basebackup<'a, T> {
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
timeline: &'a Arc<T>,
timeline: &'a Arc<DatadirTimelineImpl>,
pub lsn: Lsn,
prev_record_lsn: Lsn,
}
@@ -43,14 +44,12 @@ pub struct Basebackup<'a, T> {
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a, T> Basebackup<'a, T>
where T: Timeline,
{
impl<'a> Basebackup<'a> {
pub fn new(
write: &'a mut dyn Write,
timeline: &'a Arc<T>,
timeline: &'a Arc<DatadirTimelineImpl>,
req_lsn: Option<Lsn>,
) -> Result<Basebackup<'a, T>> {
) -> Result<Basebackup<'a>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
@@ -66,7 +65,7 @@ where T: Timeline,
// prev_lsn to Lsn(0) if we cannot provide the correct value.
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
timeline.wait_lsn(req_lsn)?;
timeline.tline.wait_lsn(req_lsn)?;
// If the requested point is the end of the timeline, we can
// provide prev_lsn. (get_last_record_rlsn() might return it as
@@ -117,20 +116,21 @@ where T: Timeline,
}
// Gather non-relational files from object storage pages.
for obj in self.timeline.list_nonrels(self.lsn)? {
match obj {
RelishTag::Slru { slru, segno } => {
self.add_slru_segment(slru, segno)?;
}
RelishTag::FileNodeMap { spcnode, dbnode } => {
self.add_relmap_file(spcnode, dbnode)?;
}
RelishTag::TwoPhase { xid } => {
self.add_twophase_file(xid)?;
}
_ => {}
for kind in [
SlruKind::Clog,
SlruKind::MultiXactOffsets,
SlruKind::MultiXactMembers,
] {
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
self.add_slru_segment(kind, segno)?;
}
}
for (spcnode, dbnode) in self.timeline.list_relmap_files(self.lsn)? {
self.add_relmap_file(spcnode, dbnode)?;
}
for xid in self.timeline.list_twophase_files(self.lsn)? {
self.add_twophase_file(xid)?;
}
// Generate pg_control and bootstrap WAL segment.
self.add_pgcontrol_file()?;
@@ -143,27 +143,14 @@ where T: Timeline,
// Generate SLRU segment files from repository.
//
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let seg_size = self
.timeline
.get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?;
if seg_size == None {
trace!(
"SLRU segment {}/{:>04X} was truncated",
slru.to_str(),
segno
);
return Ok(());
}
let nblocks = seg_size.unwrap();
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
let mut slru_buf: Vec<u8> =
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
for blknum in 0..nblocks {
let img =
self.timeline
.get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
let img = self
.timeline
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
assert!(img.len() == pg_constants::BLCKSZ as usize);
slru_buf.extend_from_slice(&img);
@@ -182,11 +169,7 @@ where T: Timeline,
// Along with them also send PG_VERSION for each database.
//
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
let img = self.timeline.get_page_at_lsn(
RelishTag::FileNodeMap { spcnode, dbnode },
0,
self.lsn,
)?;
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
@@ -222,9 +205,7 @@ where T: Timeline,
// Extract twophase state files
//
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
let img = self
.timeline
.get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
@@ -244,11 +225,11 @@ where T: Timeline,
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
let checkpoint_bytes = self
.timeline
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
.get_checkpoint(self.lsn)
.context("failed to get checkpoint bytes")?;
let pg_control_bytes = self
.timeline
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
.get_control_file(self.lsn)
.context("failed get control bytes")?;
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
@@ -269,7 +250,7 @@ where T: Timeline,
// add zenith.signal file
let mut zenith_signal = String::new();
if self.prev_record_lsn == Lsn(0) {
if self.lsn == self.timeline.get_ancestor_lsn() {
if self.lsn == self.timeline.tline.get_ancestor_lsn() {
write!(zenith_signal, "PREV LSN: none")?;
} else {
write!(zenith_signal, "PREV LSN: invalid")?;


@@ -20,12 +20,14 @@ use zenith_utils::lsn::Lsn;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use zenith_utils::{crashsafe_dir, logging};
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::repository::{Repository, Timeline};
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
use crate::{config::PageServerConf, repository::Repository};
use crate::RepositoryImpl;
use crate::{import_datadir, LOG_FILE_NAME};
use crate::{repository::RepositoryTimeline, tenant_mgr};
use crate::repository::Timeline;
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
@@ -40,10 +42,10 @@ pub struct BranchInfo {
}
impl BranchInfo {
pub fn from_path<R: Repository, P: AsRef<Path>>(
path: P,
pub fn from_path<R: Repository, T: AsRef<Path>>(
path: T,
repo: &R,
include_non_incremental_logical_size: bool,
_include_non_incremental_logical_size: bool,
) -> Result<Self> {
let path = path.as_ref();
let name = path.file_name().unwrap().to_string_lossy().to_string();
@@ -74,11 +76,17 @@ impl BranchInfo {
// non incremental size calculation can be heavy, so let it be optional
// needed for tests to check size calculation
//
// FIXME
/*
let current_logical_size_non_incremental = include_non_incremental_logical_size
.then(|| {
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
})
.transpose()?;
*/
let current_logical_size_non_incremental = Some(0);
let current_logical_size = 0;
Ok(BranchInfo {
name,
@@ -86,7 +94,7 @@ impl BranchInfo {
latest_valid_lsn: timeline.get_last_record_lsn(),
ancestor_id,
ancestor_lsn,
current_logical_size: timeline.get_current_logical_size(),
current_logical_size, // : timeline.get_current_logical_size(),
current_logical_size_non_incremental,
})
}
@@ -130,7 +138,7 @@ pub fn create_repo(
conf: &'static PageServerConf,
tenantid: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
) -> Result<Arc<crate::layered_repository::LayeredRepository>> {
) -> Result<Arc<RepositoryImpl>> {
let repo_dir = conf.tenant_path(&tenantid);
if repo_dir.exists() {
bail!("repo for {} already exists", tenantid)
@@ -152,19 +160,19 @@ pub fn create_repo(
crashsafe_dir::create_dir(&timelinedir)?;
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
let repo = crate::layered_repository::LayeredRepository::new(
conf,
wal_redo_manager,
tenantid,
conf.remote_storage_config.is_some(),
));
);
// Load data into pageserver
// TODO To implement zenith import we need to
// move data loading out of create_repo()
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
bootstrap_timeline(conf, tenantid, timeline_id, &repo)?;
Ok(repo)
Ok(Arc::new(repo))
}
// Returns checkpoint LSN from controlfile
@@ -233,17 +241,16 @@ fn bootstrap_timeline<R: Repository>(
// Initdb lsn will be equal to last_record_lsn which will be set after import.
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
let timeline = repo.create_empty_timeline(tli, lsn)?;
import_datadir::import_timeline_from_postgres_datadir(
&pgdata_path,
&*timeline,
lsn,
)?;
timeline.checkpoint(CheckpointConfig::Forced)?;
let page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline);
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &page_tline, lsn)?;
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
println!(
"created initial timeline {} timeline.lsn {}",
tli,
timeline.get_last_record_lsn()
page_tline.tline.get_last_record_lsn()
);
let data = tli.to_string();


@@ -26,8 +26,9 @@ use super::models::BranchCreateRequest;
use super::models::StatusResponse;
use super::models::TenantCreateRequest;
use crate::branches::BranchInfo;
use crate::repository::{Repository, RepositoryTimeline, Timeline};
use crate::repository::RepositoryTimeline;
use crate::repository::TimelineSyncState;
use crate::repository::{Repository, Timeline};
use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
#[derive(Debug)]


@@ -11,14 +11,15 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use tracing::*;
use crate::pgdatadir_mapping::*;
use crate::relish::*;
use crate::repository::*;
use crate::repository::Repository;
use crate::walingest::WalIngest;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::waldecoder::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::Oid;
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
use postgres_ffi::{Oid, TransactionId};
use zenith_utils::lsn::Lsn;
///
@@ -27,45 +28,43 @@ use zenith_utils::lsn::Lsn;
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir<T: Timeline>(
pub fn import_timeline_from_postgres_datadir<R: Repository>(
path: &Path,
timeline: &T,
tline: &DatadirTimeline<R>,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
let writer_box = timeline.writer();
let writer = writer_box.as_ref();
let mut writer = tline.begin_record(lsn);
writer.init_empty()?;
// Scan 'global'
let mut relfiles: Vec<PathBuf> = Vec::new();
writer.put_dbdir_creation(pg_constants::GLOBALTABLESPACE_OID, 0)?;
for direntry in fs::read_dir(path.join("global"))? {
let direntry = direntry?;
match direntry.file_name().to_str() {
None => continue,
Some("pg_control") => {
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
pg_control = Some(import_control_file(&mut writer, &direntry.path())?);
}
Some("pg_filenode.map") => {
import_relmap_file(
&mut writer,
pg_constants::GLOBALTABLESPACE_OID,
0,
&direntry.path(),
)?;
}
Some("pg_filenode.map") => import_nonrel_file(
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
dbnode: 0,
},
&direntry.path(),
)?,
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
lsn,
pg_constants::GLOBALTABLESPACE_OID,
0,
)?,
// Load any relation files into the page server (but only after the other files)
_ => relfiles.push(direntry.path()),
}
}
for relfile in relfiles {
import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?;
}
// Scan 'base'. It contains database dirs, the database OID is the filename.
// E.g. 'base/12345', where 12345 is the database OID.
@@ -79,54 +78,56 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
let mut relfiles: Vec<PathBuf> = Vec::new();
for direntry in fs::read_dir(direntry.path())? {
let direntry = direntry?;
match direntry.file_name().to_str() {
None => continue,
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => import_nonrel_file(
writer,
lsn,
RelishTag::FileNodeMap {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
dbnode: dboid,
},
Some("PG_VERSION") => {
writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
}
Some("pg_filenode.map") => import_relmap_file(
&mut writer,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
&direntry.path(),
)?,
// Load any relation files into the page server
_ => import_relfile(
&direntry.path(),
writer,
lsn,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
)?,
_ => relfiles.push(direntry.path()),
}
}
for relfile in relfiles {
import_relfile(
&mut writer,
&relfile,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
)?;
}
}
for entry in fs::read_dir(path.join("pg_xact"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
let entry = entry?;
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?;
}
for entry in fs::read_dir(path.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
import_twophase_file(&mut writer, xid, &entry.path())?;
}
// TODO: Scan pg_tblspc
// We're done importing all the data files.
writer.advance_last_record_lsn(lsn);
writer.finish()?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -144,8 +145,7 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
import_wal(
&path.join("pg_wal"),
timeline,
writer,
tline,
Lsn(pg_control.checkPointCopy.redo),
lsn,
)?;
@@ -154,10 +154,9 @@ pub fn import_timeline_from_postgres_datadir<T: Timeline>(
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_relfile(
fn import_relfile<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
path: &Path,
timeline: &dyn TimelineWriter,
lsn: Lsn,
spcoid: Oid,
dboid: Oid,
) -> Result<()> {
@@ -174,19 +173,28 @@ fn import_relfile(
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let len = file.metadata().unwrap().len();
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
let nblocks = len / pg_constants::BLCKSZ as u64;
if segno != 0 {
todo!();
}
let rel = RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
};
timeline.put_rel_creation(rel, nblocks as u32)?;
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let rel = RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
};
let tag = RelishTag::Relation(rel);
timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?;
timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -203,20 +211,37 @@ fn import_relfile(
};
blknum += 1;
}
ensure!(blknum == nblocks as u32);
Ok(())
}
///
/// FIXME
/// Import a "non-blocky" file into the repository
///
/// This is used for small files like the control file, twophase files etc. that
/// are just slurped into the repository as one blob.
///
fn import_nonrel_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
tag: RelishTag,
fn import_relmap_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
spcnode: Oid,
dbnode: Oid,
path: &Path,
) -> Result<()> {
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;
trace!("importing relmap file {}", path.display());
timeline.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
fn import_twophase_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
xid: TransactionId,
path: &Path,
) -> Result<()> {
let mut file = File::open(path)?;
@@ -226,7 +251,7 @@ fn import_nonrel_file(
trace!("importing non-rel file {}", path.display());
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
timeline.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
@@ -235,9 +260,8 @@ fn import_nonrel_file(
///
/// The control file is imported as is, but we also extract the checkpoint record
/// from it and store it separated.
fn import_control_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
fn import_control_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
path: &Path,
) -> Result<ControlFileData> {
let mut file = File::open(path)?;
@@ -248,17 +272,12 @@ fn import_control_file(
trace!("importing control file {}", path.display());
// Import it as ControlFile
timeline.put_page_image(
RelishTag::ControlFile,
0,
lsn,
Bytes::copy_from_slice(&buffer[..]),
)?;
timeline.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
// Extract the checkpoint record and import it separately.
let pg_control = ControlFileData::decode(&buffer)?;
let checkpoint_bytes = pg_control.checkPointCopy.encode();
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
timeline.put_checkpoint(checkpoint_bytes)?;
Ok(pg_control)
}
@@ -266,30 +285,31 @@ fn import_control_file(
///
/// Import an SLRU segment file
///
fn import_slru_file(
timeline: &dyn TimelineWriter,
lsn: Lsn,
fn import_slru_file<R: Repository>(
timeline: &mut DatadirTimelineWriter<R>,
slru: SlruKind,
path: &Path,
) -> Result<()> {
// Does it look like an SLRU file?
trace!("importing slru file {}", path.display());
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
trace!("importing slru file {}", path.display());
let len = file.metadata().unwrap().len();
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
let nblocks = len / pg_constants::BLCKSZ as u64;
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?;
let mut rpageno = 0;
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_page_image(
RelishTag::Slru { slru, segno },
rpageno,
lsn,
Bytes::copy_from_slice(&buf),
)?;
timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -305,19 +325,17 @@ fn import_slru_file(
},
};
rpageno += 1;
// TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages
}
ensure!(rpageno == nblocks as u32);
Ok(())
}
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal<T: Timeline>(
fn import_wal<R: Repository>(
walpath: &Path,
timeline: &T,
writer: &dyn TimelineWriter,
tline: &DatadirTimeline<R>,
startpoint: Lsn,
endpoint: Lsn,
) -> Result<()> {
@@ -327,7 +345,7 @@ fn import_wal<T: Timeline>(
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
let mut last_lsn = startpoint;
let mut walingest = WalIngest::new(timeline, startpoint)?;
let mut walingest = WalIngest::new(tline, startpoint)?;
while last_lsn <= endpoint {
// FIXME: assume postgresql tli 1 for now
@@ -360,7 +378,7 @@ fn import_wal<T: Timeline>(
let mut nrecords = 0;
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(timeline, writer, recdata, lsn)?;
walingest.ingest_record(tline, recdata, lsn)?;
last_lsn = lsn;
nrecords += 1;

File diff suppressed because it is too large.


@@ -1,6 +1,5 @@
//!
//! A DeltaLayer represents a collection of WAL records or page images in a range of
//! LSNs, for one segment. It is stored on a file on disk.
//! LSNs, and in a range of Keys. It is stored on a file on disk.
//!
//! Usually a delta layer only contains differences - in the form of WAL records against
//! a base LSN. However, if a segment is newly created, by creating a new relation or
@@ -11,56 +10,53 @@
//! can happen when you create a new branch in the middle of a delta layer, and the WAL
//! records on the new branch are put in a new delta layer.
//!
//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters
//! When a delta file needs to be accessed, we slurp the 'index' metadata
//! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN.
//! The byte ranges in the metadata can be used to find the page/WAL record in
//! PAGE_VERSIONS_CHAPTER.
//! To access a particular value, we search `index` for the given key.
//! The byte offset in the index can be used to find the value in
//! VALUES_CHAPTER.
//!
//! On disk, the delta files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each delta file is named like this:
//!
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
//! <key start>-<key end>__<start LSN>-<end LSN>
//!
//! For example:
//!
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
//!
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
//! So the above example would become:
//!
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED
//!
//! The end LSN indicates when it was dropped in that case, we don't store it in the
//! file contents in any way.
//!
//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two
//! parts: the page versions and the segment sizes. They are stored as separate chapters.
//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
//! parts: the 'index', the values, and a short summary header. They are stored as
//! separate chapters.
//!
use crate::config::PageServerConf;
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
RELISH_SEG_SIZE,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::layered_repository::utils;
use crate::repository::{Key, Value};
use crate::virtual_file::VirtualFile;
use crate::walrecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{bail, ensure, Result};
use anyhow::{bail, Result};
use log::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use zenith_utils::vec_map::VecMap;
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::fs;
use std::io::{BufWriter, Write};
use std::ops::Bound::Included;
use std::io::BufWriter;
use std::io::Write;
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, MutexGuard};
use std::sync::{RwLock, RwLockReadGuard};
use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter};
use bookfile::{Book, BookWriter, ChapterWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
@@ -68,27 +64,23 @@ use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith delta file
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
/// Mapping from (block #, lsn) -> page/WAL record
/// byte ranges in PAGE_VERSIONS_CHAPTER
static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
/// Mapping from (key, lsn) -> page/WAL record
/// byte ranges in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Page/WAL bytes - cannot be interpreted
/// without PAGE_VERSION_METAS_CHAPTER
static PAGE_VERSIONS_CHAPTER: u64 = 2;
static SEG_SIZES_CHAPTER: u64 = 3;
/// without the page versions from the INDEX_CHAPTER
static VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
static SUMMARY_CHAPTER: u64 = 4;
static SUMMARY_CHAPTER: u64 = 3;
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct Summary {
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
}
impl From<&DeltaLayer> for Summary {
@@ -96,33 +88,17 @@ impl From<&DeltaLayer> for Summary {
Self {
tenantid: layer.tenantid,
timelineid: layer.timelineid,
seg: layer.seg,
start_lsn: layer.start_lsn,
end_lsn: layer.end_lsn,
dropped: layer.dropped,
key_range: layer.key_range.clone(),
lsn_range: layer.lsn_range.clone(),
}
}
}
#[derive(Serialize, Deserialize)]
struct BlobRange {
offset: u64,
size: usize,
}
fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
let mut buf = vec![0u8; range.size];
reader.read_exact_at(&mut buf, range.offset)?;
Ok(buf)
}
///
/// DeltaLayer is the in-memory data structure associated with an
/// on-disk delta file. We keep a DeltaLayer in memory for each
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
/// copy of the file in memory, in 'inner'. Otherwise the struct is
/// copy of the index in memory, in 'inner'. Otherwise the struct is
/// just a placeholder for a file that exists on disk, and it needs to
/// be loaded before using it in queries.
///
@@ -131,47 +107,24 @@ pub struct DeltaLayer {
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
//
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
// start is inclusive, and end is exclusive.
//
pub start_lsn: Lsn,
pub end_lsn: Lsn,
dropped: bool,
inner: Mutex<DeltaLayerInner>,
inner: RwLock<DeltaLayerInner>,
}
pub struct DeltaLayerInner {
/// If false, the 'page_version_metas' and 'seg_sizes' have not been
/// loaded into memory yet.
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
///
/// All versions of all pages in the layer are kept here.
/// Indexed by block number and LSN. The value is an offset into the
/// chapter where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, u64>>,
book: Option<Book<VirtualFile>>,
/// All versions of all pages in the file are are kept here.
/// Indexed by block number and LSN.
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
/// `seg_sizes` tracks the size of the segment at different points in time.
seg_sizes: VecMap<Lsn, SegmentBlk>,
}
impl DeltaLayerInner {
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
// Scan the VecMap backwards, starting from the given entry.
let slice = self
.seg_sizes
.slice_range((Included(&Lsn(0)), Included(&lsn)));
if let Some((_entry_lsn, entry)) = slice.last() {
Ok(*entry)
} else {
bail!("could not find seg size in delta layer")
}
}
}
impl Layer for DeltaLayer {
@@ -183,40 +136,31 @@ impl Layer for DeltaLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn is_dropped(&self) -> bool {
self.dropped
}
fn get_start_lsn(&self) -> Lsn {
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
self.end_lsn
fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
fn filename(&self) -> PathBuf {
PathBuf::from(self.layer_name().to_string())
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
lsn_floor: Lsn,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
let mut need_image = true;
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
assert!(self.key_range.contains(&reconstruct_state.key));
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
match &reconstruct_state.img {
Some((cached_lsn, _)) if &self.lsn_range.end <= cached_lsn => {
reconstruct_state.lsn = *cached_lsn;
return Ok(ValueReconstructResult::Complete);
}
_ => {}
}
@@ -224,91 +168,74 @@ impl Layer for DeltaLayer {
{
// Open the file and lock the metadata in memory
let inner = self.load()?;
let page_version_reader = inner
let values_reader = inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
.chapter_reader(VALUES_CHAPTER)?;
// Scan the metadata VecMap backwards, starting from the given entry.
let minkey = (blknum, Lsn(0));
let maxkey = (blknum, lsn);
let iter = inner
.page_version_metas
.slice_range((Included(&minkey), Included(&maxkey)))
.iter()
.rev();
for ((_blknum, pv_lsn), blob_range) in iter {
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if pv_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&reconstruct_state.key) {
let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn);
for (entry_lsn, pos) in slice.iter().rev() {
match &reconstruct_state.img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
reconstruct_state.lsn = *cached_lsn;
return Ok(ValueReconstructResult::Complete);
}
_ => {}
}
_ => {}
}
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
match pv {
PageVersion::Page(img) => {
// Found a page image, return it
reconstruct_data.page_img = Some((*pv_lsn, img));
need_image = false;
break;
}
PageVersion::Wal(rec) => {
let will_init = rec.will_init();
reconstruct_data.records.push((*pv_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?;
match val {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
need_image = false;
break;
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// If we didn't find any records for this, check if the request is beyond EOF
if need_image
&& reconstruct_data.records.is_empty()
&& self.seg.rel.is_blocky()
&& blknum >= inner.get_seg_size(lsn)?
{
return Ok(PageReconstructResult::Missing(self.start_lsn));
}
// release metadata lock and close the file
}
// If an older page image is needed to reconstruct the page, let the
// caller know.
if need_image {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
reconstruct_state.lsn = Lsn(self.lsn_range.start.0 - 1);
Ok(ValueReconstructResult::Continue)
} else {
Ok(PageReconstructResult::Complete)
Ok(ValueReconstructResult::Complete)
}
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
// Return a set of all distinct Keys present in this layer
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
let inner = self.load()?;
inner.get_seg_size(lsn)
keys.extend(inner.index.keys().filter(|x| key_range.contains(x)));
Ok(())
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
// Is the requested LSN after the rel was dropped?
if self.dropped && lsn >= self.end_lsn {
return Ok(false);
}
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_> {
let inner = self.load().unwrap();
// Otherwise, it exists.
Ok(true)
let mut pairs: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
pairs.sort_by_key(|x| x.0);
match DeltaValueIter::new(inner) {
Ok(iter) => Box::new(iter),
Err(err) => Box::new(std::iter::once(Err(err)))
}
}
///
@@ -316,14 +243,14 @@ impl Layer for DeltaLayer {
/// it will need to be loaded back.
///
fn unload(&self) -> Result<()> {
let mut inner = self.inner.lock().unwrap();
inner.page_version_metas = VecMap::default();
inner.seg_sizes = VecMap::default();
inner.loaded = false;
if let Ok(mut inner) = self.inner.try_write() {
inner.index = HashMap::default();
inner.loaded = false;
// Note: we keep the Book open. Is that a good idea? The virtual file
// machinery has its own rules for closing the file descriptor if it's not
// needed, but the Book struct uses up some memory, too.
// Note: we keep the Book open. Is that a good idea? The virtual file
// machinery has its own rules for closing the file descriptor if it's not
// needed, but the Book struct uses up some memory, too.
}
Ok(())
}
@@ -345,45 +272,52 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} seg {} {}-{} ----",
self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenantid,
self.timelineid,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
println!("--- seg sizes ---");
let inner = self.load()?;
for (k, v) in inner.seg_sizes.as_slice() {
println!(" {}: {}", k, v);
}
println!("--- page versions ---");
let path = self.path();
let file = std::fs::File::open(&path)?;
let book = Book::new(file)?;
let chapter = book.chapter_reader(VALUES_CHAPTER)?;
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
let mut desc = String::new();
let mut values: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
values.sort_by_key(|k| k.0);
let buf = read_blob(&chapter, blob_range)?;
let pv = PageVersion::des(&buf)?;
for (key, versions) in values {
for (lsn, off) in versions.as_slice() {
let mut desc = String::new();
match pv {
PageVersion::Page(img) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
PageVersion::Wal(rec) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
blob_range.size,
rec.will_init(),
wal_desc
)?;
let buf = utils::read_blob_from_chapter(&chapter, *off)?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
write!(&mut desc, " img {} bytes", img.len())?;
}
Ok(Value::WalRecord(rec)) => {
let wal_desc = walrecord::describe_wal_record(&rec);
write!(
&mut desc,
" rec {} bytes will_init: {} {}",
buf.len(),
rec.will_init(),
wal_desc
)?;
}
Err(err) => {
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
}
}
println!(" key {} at {}: {}", key, lsn, desc);
}
println!(" blk {} at {}: {}", blk, lsn, desc);
}
Ok(())
@@ -408,61 +342,61 @@ impl DeltaLayer {
///
/// Load the contents of the file into memory
///
fn load(&self) -> Result<MutexGuard<DeltaLayerInner>> {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
fn load(&self) -> Result<RwLockReadGuard<DeltaLayerInner>> {
loop {
// quick exit if already loaded
{
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)?;
inner.book = Some(Book::new(file)?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
if inner.loaded {
return Ok(inner);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
// need to upgrade to write lock
let mut inner = self.inner.write().unwrap();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
let path = self.path();
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)?;
inner.book = Some(Book::new(file)?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let actual_summary = Summary::des(&chapter)?;
let expected_summary = Summary::from(self);
if actual_summary != expected_summary {
bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
}
}
PathOrConf::Path(path) => {
let actual_filename = Path::new(path.file_name().unwrap());
let expected_filename = self.filename();
if actual_filename != expected_filename {
println!(
"warning: filename does not match what is expected from in-file summary"
);
println!("actual: {:?}", actual_filename);
println!("expected: {:?}", expected_filename);
}
}
}
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
inner.index = index;
inner.loaded = true;
}
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
let page_version_metas = VecMap::des(&chapter)?;
let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?;
let seg_sizes = VecMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
inner.page_version_metas = page_version_metas;
inner.seg_sizes = seg_sizes;
inner.loaded = true;
Ok(inner)
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -476,15 +410,12 @@ impl DeltaLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid,
tenantid,
seg: filename.seg,
start_lsn: filename.start_lsn,
end_lsn: filename.end_lsn,
dropped: filename.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: filename.key_range.clone(),
lsn_range: filename.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
index: HashMap::default(),
}),
}
}
@@ -494,7 +425,7 @@ impl DeltaLayer {
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
where
F: std::os::unix::prelude::FileExt,
F: FileExt,
{
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
let summary = Summary::des(&chapter)?;
@@ -503,25 +434,20 @@ impl DeltaLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timelineid: summary.timelineid,
tenantid: summary.tenantid,
seg: summary.seg,
start_lsn: summary.start_lsn,
end_lsn: summary.end_lsn,
dropped: summary.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: summary.key_range,
lsn_range: summary.lsn_range,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
index: HashMap::default(),
}),
})
}
fn layer_name(&self) -> DeltaFileName {
DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
}
}
@@ -542,24 +468,24 @@ impl DeltaLayer {
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_page_version` for every page
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
pub struct DeltaLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
page_version_writer: ChapterWriter<BufWriter<VirtualFile>>,
pv_offset: u64,
key_start: Key,
lsn_range: Range<Lsn>,
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
index: HashMap<Key, VecMap<Lsn, u64>>,
values_writer: ChapterWriter<BufWriter<VirtualFile>>,
end_offset: u64,
}
impl DeltaLayerWriter {
@@ -570,94 +496,85 @@ impl DeltaLayerWriter {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
end_lsn: Lsn,
dropped: bool,
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
// Create the file
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let path = DeltaLayer::path_for(
&PathOrConf::Conf(conf),
timelineid,
tenantid,
&DeltaFileName {
seg,
start_lsn,
end_lsn,
dropped,
},
);
let path = conf
.timeline_path(&timelineid, &tenantid)
.join(format!("{}-XXX__{:016X}-{:016X}.temp",
key_start,
u64::from(lsn_range.start),
u64::from(lsn_range.end)));
info!("temp deltalayer path {}", path.display());
let file = VirtualFile::create(&path)?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
// Open the page-versions chapter for writing. The calls to
// `put_page_version` will use this to write the contents.
let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER);
// `put_value` will use this to write the contents.
let values_writer = book.new_chapter(VALUES_CHAPTER);
Ok(DeltaLayerWriter {
conf,
path,
timelineid,
tenantid,
seg,
start_lsn,
end_lsn,
dropped,
page_version_writer,
page_version_metas: VecMap::default(),
pv_offset: 0,
key_start,
lsn_range,
index: HashMap::new(),
values_writer,
end_offset: 0,
})
}
///
/// Append a page version to the file.
/// Append a key-value pair to the file.
///
/// 'buf' is a serialized PageVersion.
/// The page versions must be appended in blknum, lsn order.
/// The values must be appended in key, lsn order.
///
pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> {
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
//info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
assert!(self.lsn_range.start <= lsn);
// Remember the offset and size metadata. The metadata is written
// to a separate chapter, in `finish`.
let blob_range = BlobRange {
offset: self.pv_offset,
size: buf.len(),
};
self.page_version_metas
.append((blknum, lsn), blob_range)
.unwrap();
// write the page version
self.page_version_writer.write_all(buf)?;
self.pv_offset += buf.len() as u64;
let off = self.end_offset;
let len = utils::write_blob(&mut self.values_writer, &Value::ser(&val)?)?;
self.end_offset += len;
let vec_map = self.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
bail!(
"Value for {} at {} already exists in delta layer being built",
key,
lsn
);
}
Ok(())
}
pub fn size(&self) -> u64 {
self.end_offset
}
///
/// Finish writing the delta layer.
///
/// 'seg_sizes' is a list of size changes to store with the actual data.
///
pub fn finish(self, seg_sizes: VecMap<Lsn, SegmentBlk>) -> Result<DeltaLayer> {
// Close the page-versions chapter
let book = self.page_version_writer.close()?;
pub fn finish(self, key_end: Key) -> Result<DeltaLayer> {
// Close the values chapter
let book = self.values_writer.close()?;
// Write out page versions metadata
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
let buf = VecMap::ser(&self.page_version_metas)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
if self.seg.rel.is_blocky() {
assert!(!seg_sizes.is_empty());
}
// and seg_sizes to separate chapter
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
let buf = VecMap::ser(&seg_sizes)?;
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
@@ -665,12 +582,8 @@ impl DeltaLayerWriter {
let summary = Summary {
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
};
Summary::ser_into(&summary, &mut chapter)?;
let book = chapter.close()?;
@@ -685,20 +598,111 @@ impl DeltaLayerWriter {
path_or_conf: PathOrConf::Conf(self.conf),
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn: self.end_lsn,
dropped: self.dropped,
inner: Mutex::new(DeltaLayerInner {
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
index: HashMap::new(),
book: None,
page_version_metas: VecMap::default(),
seg_sizes: VecMap::default(),
}),
};
trace!("created delta layer {}", &layer.path().display());
// Rename the file to its final name
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let final_path = DeltaLayer::path_for(
&PathOrConf::Conf(self.conf),
self.timelineid,
self.tenantid,
&DeltaFileName {
key_range: self.key_start..key_end,
lsn_range: self.lsn_range,
},
);
std::fs::rename(self.path, final_path)?;
Ok(layer)
}
pub fn abort(self) {
match self.values_writer.close() {
Ok(book) => {
if let Err(err) = book.close() {
error!("error while closing delta layer file: {}", err);
}
}
Err(err) => {
error!("error while closing chapter writer: {}", err);
}
}
if let Err(err) = std::fs::remove_file(self.path) {
error!("error removing unfinished delta layer file: {}", err);
}
}
}
struct DeltaValueIter<'a> {
all_offsets: Vec<(Key, Lsn, u64)>,
next_idx: usize,
inner: RwLockReadGuard<'a, DeltaLayerInner>,
}
impl<'a> Iterator for DeltaValueIter<'a> {
type Item = Result<(Key, Lsn, Value)>;
fn next(&mut self) -> Option<Self::Item> {
self.next_res().transpose()
}
}
///
/// Iterator over all key-value pairs stored in a delta layer
///
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
/// That takes up quite a lot of memory. Should do this in a more streaming
/// fashion.
///
impl<'a> DeltaValueIter<'a> {
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let mut index: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
index.sort_by_key(|x| x.0);
let mut all_offsets: Vec<(Key, Lsn, u64)> = Vec::new();
for (key, vec_map) in index.iter() {
for (lsn, off) in vec_map.as_slice().iter() {
all_offsets.push((**key, *lsn, *off));
}
}
Ok(DeltaValueIter {
all_offsets,
inner,
next_idx: 0,
})
}
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
if self.next_idx < self.all_offsets.len() {
let (key, lsn, off) = self.all_offsets[self.next_idx];
let values_reader = self.inner
.book
.as_ref()
.expect("should be loaded in load call above")
.chapter_reader(VALUES_CHAPTER)?;
let val = Value::des(&utils::read_blob_from_chapter(&values_reader, off)?)?;
self.next_idx += 1;
Ok(Some((key, lsn, val)))
} else {
Ok(None)
}
}
}


@@ -2,29 +2,52 @@
//! Helper functions for dealing with filenames of the image and delta layer files.
//!
use crate::config::PageServerConf;
use crate::layered_repository::storage_layer::SegmentTag;
use crate::relish::*;
use crate::repository::Key;
use std::cmp::Ordering;
use std::fmt;
use std::ops::Range;
use std::path::PathBuf;
use zenith_utils::lsn::Lsn;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct DeltaFileName {
pub seg: SegmentTag,
pub start_lsn: Lsn,
pub end_lsn: Lsn,
pub dropped: bool,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
}
impl PartialOrd for DeltaFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for DeltaFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp;
cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
return cmp;
}
cmp = self.key_range.end.cmp(&other.key_range.end);
if cmp != Ordering::Equal {
return cmp;
}
cmp = self.lsn_range.start.cmp(&other.lsn_range.start);
if cmp != Ordering::Equal {
return cmp;
}
cmp = self.lsn_range.end.cmp(&other.lsn_range.end);
cmp
}
}
/// Represents the filename of a DeltaLayer
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>
///
/// or if it was dropped:
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>_DROPPED
/// <seg start>-<seg end>-<LSN start>-<LSN end>
///
impl DeltaFileName {
///
@@ -32,234 +55,124 @@ impl DeltaFileName {
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
let mut lsn_parts = parts.next()?.split('-');
let key_start_str = key_parts.next()?;
let key_end_str = key_parts.next()?;
let lsn_start_str = lsn_parts.next()?;
let lsn_end_str = lsn_parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
let seg = SegmentTag { rel, segno };
let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
let mut dropped = false;
if let Some(suffix) = parts.next() {
if suffix == "DROPPED" {
dropped = true;
} else {
return None;
}
}
if parts.next().is_some() {
if start_lsn >= end_lsn {
return None;
// or panic?
}
if key_start >= key_end {
return None;
// or panic?
}
Some(DeltaFileName {
seg,
start_lsn,
end_lsn,
dropped,
key_range: key_start..key_end,
lsn_range: start_lsn..end_lsn,
})
}
}
impl fmt::Display for DeltaFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
write!(
f,
"{}_{}_{:016X}_{:016X}{}",
basename,
self.seg.segno,
u64::from(self.start_lsn),
u64::from(self.end_lsn),
if self.dropped { "_DROPPED" } else { "" }
"{}-{}__{:016X}-{:016X}",
self.key_range.start,
self.key_range.end,
u64::from(self.lsn_range.start),
u64::from(self.lsn_range.end),
)
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct ImageFileName {
pub seg: SegmentTag,
pub key_range: Range<Key>,
pub lsn: Lsn,
}
impl PartialOrd for ImageFileName {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for ImageFileName {
fn cmp(&self, other: &Self) -> Ordering {
let mut cmp;
cmp = self.key_range.start.cmp(&other.key_range.start);
if cmp != Ordering::Equal {
return cmp;
}
cmp = self.key_range.end.cmp(&other.key_range.end);
if cmp != Ordering::Equal {
return cmp;
}
cmp = self.lsn.cmp(&other.lsn);
cmp
}
}
///
/// Represents the filename of an ImageLayer
///
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<LSN>
///
/// FIXME
impl ImageFileName {
///
/// Parse a string as an image file name. Returns None if the filename does not
/// match the expected pattern.
///
pub fn parse_str(fname: &str) -> Option<Self> {
let rel;
let mut parts;
if let Some(rest) = fname.strip_prefix("rel_") {
parts = rest.split('_');
rel = RelishTag::Relation(RelTag {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
relnode: parts.next()?.parse::<u32>().ok()?,
forknum: parts.next()?.parse::<u8>().ok()?,
});
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::Clog,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
parts = rest.split('_');
rel = RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
parts = rest.split('_');
rel = RelishTag::FileNodeMap {
spcnode: parts.next()?.parse::<u32>().ok()?,
dbnode: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
parts = rest.split('_');
rel = RelishTag::TwoPhase {
xid: parts.next()?.parse::<u32>().ok()?,
};
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
parts = rest.split('_');
rel = RelishTag::Checkpoint;
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
parts = rest.split('_');
rel = RelishTag::ControlFile;
} else {
let mut parts = fname.split("__");
let mut key_parts = parts.next()?.split('-');
let key_start_str = key_parts.next()?;
let key_end_str = key_parts.next()?;
let lsn_str = parts.next()?;
if parts.next().is_some() || key_parts.next().is_some() {
return None;
}
let segno = parts.next()?.parse::<u32>().ok()?;
let key_start = Key::from_hex(key_start_str).ok()?;
let key_end = Key::from_hex(key_end_str).ok()?;
let seg = SegmentTag { rel, segno };
let lsn = Lsn::from_hex(lsn_str).ok()?;
let lsn = Lsn::from_hex(parts.next()?).ok()?;
if parts.next().is_some() {
return None;
}
Some(ImageFileName { seg, lsn })
Some(ImageFileName {
key_range: key_start..key_end,
lsn,
})
}
}
impl fmt::Display for ImageFileName {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let basename = match self.seg.rel {
RelishTag::Relation(reltag) => format!(
"rel_{}_{}_{}_{}",
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
),
RelishTag::Slru {
slru: SlruKind::Clog,
segno,
} => format!("pg_xact_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactMembers,
segno,
} => format!("pg_multixact_members_{:04X}", segno),
RelishTag::Slru {
slru: SlruKind::MultiXactOffsets,
segno,
} => format!("pg_multixact_offsets_{:04X}", segno),
RelishTag::FileNodeMap { spcnode, dbnode } => {
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
}
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
RelishTag::ControlFile => "pg_control".to_string(),
};
write!(
f,
"{}_{}_{:016X}",
basename,
self.seg.segno,
"{}-{}__{:016X}",
self.key_range.start,
self.key_range.end,
u64::from(self.lsn),
)
}
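For reference, a minimal sketch of how the new naming scheme is meant to round-trip through parse_str and Display; the hex values are purely illustrative (they are the example file name from the image_layer.rs module comment further below):
// Illustrative sketch only, not part of the diff.
let name = "000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568";
let parsed = ImageFileName::parse_str(name).expect("well-formed image layer file name");
assert_eq!(parsed.to_string(), name);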

View File

@@ -1,142 +0,0 @@
//!
//! Global registry of open layers.
//!
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
//! in-memory layers in the system, and know when we need to evict some to release
//! memory.
//!
//! Each layer is assigned a unique ID when it's registered in the global registry.
//! The ID can be used to relocate the layer later, without having to hold locks.
//!
use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::{Arc, RwLock};
use super::inmemory_layer::InMemoryLayer;
use lazy_static::lazy_static;
const MAX_USAGE_COUNT: u8 = 5;
lazy_static! {
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
RwLock::new(InMemoryLayers::default());
}
// TODO these types can probably be smaller
#[derive(PartialEq, Eq, Clone, Copy)]
pub struct LayerId {
index: usize,
tag: u64, // to avoid ABA problem
}
enum SlotData {
Occupied(Arc<InMemoryLayer>),
/// Vacant slots form a linked list, the value is the index
/// of the next vacant slot in the list.
Vacant(Option<usize>),
}
struct Slot {
tag: u64,
data: SlotData,
usage_count: AtomicU8, // for clock algorithm
}
#[derive(Default)]
pub struct InMemoryLayers {
slots: Vec<Slot>,
num_occupied: usize,
// Head of free-slot list.
next_empty_slot_idx: Option<usize>,
}
impl InMemoryLayers {
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
let slot_idx = match self.next_empty_slot_idx {
Some(slot_idx) => slot_idx,
None => {
let idx = self.slots.len();
self.slots.push(Slot {
tag: 0,
data: SlotData::Vacant(None),
usage_count: AtomicU8::new(0),
});
idx
}
};
let slots_len = self.slots.len();
let slot = &mut self.slots[slot_idx];
match slot.data {
SlotData::Occupied(_) => {
panic!("an occupied slot was in the free list");
}
SlotData::Vacant(next_empty_slot_idx) => {
self.next_empty_slot_idx = next_empty_slot_idx;
}
}
slot.data = SlotData::Occupied(layer);
slot.usage_count.store(1, Ordering::Relaxed);
self.num_occupied += 1;
assert!(self.num_occupied <= slots_len);
LayerId {
index: slot_idx,
tag: slot.tag,
}
}
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
if slot.tag != layer_id.tag {
return None;
}
if let SlotData::Occupied(layer) = &slot.data {
let _ = slot.usage_count.fetch_update(
Ordering::Relaxed,
Ordering::Relaxed,
|old_usage_count| {
if old_usage_count < MAX_USAGE_COUNT {
Some(old_usage_count + 1)
} else {
None
}
},
);
Some(Arc::clone(layer))
} else {
None
}
}
// TODO this won't be a public API in the future
pub fn remove(&mut self, layer_id: &LayerId) {
let slot = &mut self.slots[layer_id.index];
if slot.tag != layer_id.tag {
return;
}
match &slot.data {
SlotData::Occupied(_layer) => {
// TODO evict the layer
}
SlotData::Vacant(_) => unimplemented!(),
}
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
self.next_empty_slot_idx = Some(layer_id.index);
assert!(self.num_occupied > 0);
self.num_occupied -= 1;
slot.tag = slot.tag.wrapping_add(1);
}
}

View File

@@ -4,38 +4,34 @@
//! On disk, the image files are stored in timelines/<timelineid> directory.
//! Currently, there are no subdirectories, and each image layer file is named like this:
//!
//! Note that segno is
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
//! <key start>-<key end>__<LSN>
//!
//! For example:
//!
//! 1663_13990_2609_0_5_000000000169C348
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
//!
//! An image file is constructed using the 'bookfile' crate.
//!
//! Only metadata is loaded into memory by the load function.
//! When images are needed, they are read directly from disk.
//!
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
//! All the images are required to be BLOCK_SIZE, which allows for random access.
//!
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
//!
use crate::config::PageServerConf;
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::layered_repository::RELISH_SEG_SIZE;
use crate::layered_repository::utils;
use crate::repository::{Key, Value};
use crate::virtual_file::VirtualFile;
use crate::{ZTenantId, ZTimelineId};
use anyhow::{anyhow, bail, ensure, Context, Result};
use anyhow::{bail, Context, Result};
use bytes::Bytes;
use log::*;
use serde::{Deserialize, Serialize};
use std::convert::TryInto;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::{BufWriter, Write};
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::sync::{Mutex, MutexGuard};
@@ -44,12 +40,16 @@ use bookfile::{Book, BookWriter, ChapterWriter};
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
// Magic constant to identify a Zenith segment image file
// Magic constant to identify a Zenith image layer file
// FIXME: bump all magics
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
/// Mapping from Key to the byte offset of its page image
/// in VALUES_CHAPTER
static INDEX_CHAPTER: u64 = 1;
/// Contains each block in block # order
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
const VALUES_CHAPTER: u64 = 2;
/// Contains the [`Summary`] struct
const SUMMARY_CHAPTER: u64 = 3;
@@ -58,7 +58,7 @@ const SUMMARY_CHAPTER: u64 = 3;
struct Summary {
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
key_range: Range<Key>,
lsn: Lsn,
}
@@ -68,19 +68,17 @@ impl From<&ImageLayer> for Summary {
Self {
tenantid: layer.tenantid,
timelineid: layer.timelineid,
seg: layer.seg,
key_range: layer.key_range.clone(),
lsn: layer.lsn,
}
}
}
const BLOCK_SIZE: usize = 8192;
///
/// ImageLayer is the in-memory data structure associated with an on-disk image
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'.
/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'.
/// Otherwise the struct is just a placeholder for a file that exists on disk,
/// and it needs to be loaded before using it in queries.
///
@@ -88,7 +86,7 @@ pub struct ImageLayer {
path_or_conf: PathOrConf,
pub tenantid: ZTenantId,
pub timelineid: ZTimelineId,
pub seg: SegmentTag,
pub key_range: Range<Key>,
// This entry contains an image of all pages as of this LSN
pub lsn: Lsn,
@@ -96,18 +94,15 @@ pub struct ImageLayer {
inner: Mutex<ImageLayerInner>,
}
#[derive(Clone)]
enum ImageType {
Blocky { num_blocks: SegmentBlk },
NonBlocky,
}
pub struct ImageLayerInner {
/// If None, the 'image_type' has not been loaded into memory yet.
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
/// Open handle to the underlying file; None if it hasn't been opened yet.
book: Option<Book<VirtualFile>>,
/// Derived from filename and bookfile chapter metadata
image_type: ImageType,
/// offset of each value
index: HashMap<Key, u64>,
}
impl Layer for ImageLayer {
@@ -123,98 +118,80 @@ impl Layer for ImageLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
fn is_dropped(&self) -> bool {
false
}
fn get_start_lsn(&self) -> Lsn {
self.lsn
}
fn get_end_lsn(&self) -> Lsn {
fn get_lsn_range(&self) -> Range<Lsn> {
// End-bound is exclusive
self.lsn + 1
self.lsn..(self.lsn + 1)
}
/// Look up given page in the file
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
assert!(lsn >= self.lsn);
lsn_floor: Lsn,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
assert!(lsn_floor <= self.lsn);
assert!(self.key_range.contains(&reconstruct_state.key));
assert!(reconstruct_state.lsn >= self.lsn);
match reconstruct_data.page_img {
match reconstruct_state.img {
Some((cached_lsn, _)) if self.lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
reconstruct_state.lsn = cached_lsn;
return Ok(ValueReconstructResult::Complete);
}
_ => {}
}
let inner = self.load()?;
let buf = match &inner.image_type {
ImageType::Blocky { num_blocks } => {
// Check if the request is beyond EOF
if blknum >= *num_blocks {
return Ok(PageReconstructResult::Missing(lsn));
}
if let Some(offset) = inner.index.get(&reconstruct_state.key) {
let chapter = inner
.book
.as_ref()
.unwrap()
.chapter_reader(VALUES_CHAPTER)?;
let mut buf = vec![0u8; BLOCK_SIZE];
let offset = BLOCK_SIZE as u64 * blknum as u64;
let blob = utils::read_blob_from_chapter(&chapter, *offset).with_context(|| {
format!(
"failed to read value from data file {} at offset {}",
self.filename().display(),
offset
)
})?;
let value = Bytes::from(blob);
let chapter = inner
.book
.as_ref()
.unwrap()
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
chapter.read_exact_at(&mut buf, offset).with_context(|| {
format!(
"failed to read page from data file {} at offset {}",
self.filename().display(),
offset
)
})?;
buf
}
ImageType::NonBlocky => {
ensure!(blknum == 0);
inner
.book
.as_ref()
.unwrap()
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
.into_vec()
}
};
reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf)));
Ok(PageReconstructResult::Complete)
}
/// Get size of the segment
fn get_seg_size(&self, _lsn: Lsn) -> Result<SegmentBlk> {
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => Ok(num_blocks),
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
reconstruct_state.img = Some((self.lsn, value));
reconstruct_state.lsn = self.lsn;
Ok(ValueReconstructResult::Complete)
} else {
reconstruct_state.lsn = self.lsn;
Ok(ValueReconstructResult::Missing)
}
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
Ok(true)
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
let inner = self.load()?;
let index = &inner.index;
keys.extend(index.keys().filter(|x| key_range.contains(x)));
Ok(())
}
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
fn unload(&self) -> Result<()> {
// TODO: unload 'segs'. Or even better, don't hold it in memory but
// access it directly from the file (using the buffer cache)
let mut inner = self.inner.lock().unwrap();
inner.index = HashMap::default();
inner.loaded = false;
Ok(())
}
@@ -235,22 +212,17 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self) -> Result<()> {
println!(
"----- image layer for ten {} tli {} seg {} at {} ----",
self.tenantid, self.timelineid, self.seg, self.lsn
"----- image layer for ten {} tli {} key {}-{} at {} ----",
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
);
let inner = self.load()?;
match inner.image_type {
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
ImageType::NonBlocky => {
let chapter = inner
.book
.as_ref()
.unwrap()
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
println!("non-blocky ({} bytes)", chapter.len());
}
let mut index_vec: Vec<(&Key, &u64)> = inner.index.iter().collect();
index_vec.sort_by_key(|x| x.1);
for (key, offset) in index_vec {
println!("key: {} offset {}", key, offset);
}
Ok(())
@@ -279,19 +251,21 @@ impl ImageLayer {
// quick exit if already loaded
let mut inner = self.inner.lock().unwrap();
if inner.book.is_some() {
if inner.loaded {
return Ok(inner);
}
let path = self.path();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
let book = Book::new(file).with_context(|| {
format!(
"Failed to open virtual file '{}' as a bookfile",
path.display()
)
})?;
// Open the file if it's not open already.
if inner.book.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.book = Some(Book::new(file).with_context(|| {
format!("Failed to open file '{}' as a bookfile", path.display())
})?);
}
let book = inner.book.as_ref().unwrap();
match &self.path_or_conf {
PathOrConf::Conf(_) => {
@@ -318,23 +292,13 @@ impl ImageLayer {
}
}
let image_type = if self.seg.rel.is_blocky() {
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
let images_len = chapter.len();
ensure!(images_len % BLOCK_SIZE as u64 == 0);
let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?;
ImageType::Blocky { num_blocks }
} else {
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
ImageType::NonBlocky
};
let chapter = book.read_chapter(INDEX_CHAPTER)?;
let index = HashMap::des(&chapter)?;
debug!("loaded from {}", &path.display());
info!("loaded from {}", &path.display());
*inner = ImageLayerInner {
book: Some(book),
image_type,
};
inner.index = index;
inner.loaded = true;
Ok(inner)
}
@@ -350,11 +314,12 @@ impl ImageLayer {
path_or_conf: PathOrConf::Conf(conf),
timelineid,
tenantid,
seg: filename.seg,
key_range: filename.key_range.clone(),
lsn: filename.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type: ImageType::Blocky { num_blocks: 0 },
index: HashMap::new(),
loaded: false,
}),
}
}
@@ -373,18 +338,19 @@ impl ImageLayer {
path_or_conf: PathOrConf::Path(path.to_path_buf()),
timelineid: summary.timelineid,
tenantid: summary.tenantid,
seg: summary.seg,
key_range: summary.key_range,
lsn: summary.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type: ImageType::Blocky { num_blocks: 0 },
index: HashMap::new(),
loaded: false,
}),
})
}
fn layer_name(&self) -> ImageFileName {
ImageFileName {
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
}
}
@@ -413,15 +379,18 @@ impl ImageLayer {
///
pub struct ImageLayerWriter {
conf: &'static PageServerConf,
path: PathBuf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
key_range: Range<Key>,
lsn: Lsn,
num_blocks: SegmentBlk,
values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
end_offset: u64,
page_image_writer: ChapterWriter<BufWriter<VirtualFile>>,
num_blocks_written: SegmentBlk,
index: HashMap<Key, u64>,
finished: bool,
}
impl ImageLayerWriter {
@@ -429,9 +398,8 @@ impl ImageLayerWriter {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
key_range: &Range<Key>,
lsn: Lsn,
num_blocks: SegmentBlk,
) -> Result<ImageLayerWriter> {
// Create the file
//
@@ -441,70 +409,74 @@ impl ImageLayerWriter {
&PathOrConf::Conf(conf),
timelineid,
tenantid,
&ImageFileName { seg, lsn },
&ImageFileName {
key_range: key_range.clone(),
lsn,
},
);
info!("new image layer {}", path.display());
let file = VirtualFile::create(&path)?;
let buf_writer = BufWriter::new(file);
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
// Open the page-images chapter for writing. The calls to
// `put_page_image` will use this to write the contents.
let chapter = if seg.rel.is_blocky() {
book.new_chapter(BLOCKY_IMAGES_CHAPTER)
} else {
assert_eq!(num_blocks, 1);
book.new_chapter(NONBLOCKY_IMAGE_CHAPTER)
};
// `put_image` will use this to write the contents.
let chapter = book.new_chapter(VALUES_CHAPTER);
let writer = ImageLayerWriter {
conf,
path,
timelineid,
tenantid,
seg,
key_range: key_range.clone(),
lsn,
num_blocks,
page_image_writer: chapter,
num_blocks_written: 0,
values_writer: Some(chapter),
index: HashMap::new(),
end_offset: 0,
finished: false,
};
Ok(writer)
}
///
/// Write next page image to the file.
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> {
assert!(self.num_blocks_written < self.num_blocks);
if self.seg.rel.is_blocky() {
assert_eq!(block_bytes.len(), BLOCK_SIZE);
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
assert!(self.key_range.contains(&key));
let off = self.end_offset;
if let Some(writer) = &mut self.values_writer {
let len = utils::write_blob(writer, img)?;
self.end_offset += len;
let old = self.index.insert(key, off);
assert!(old.is_none());
} else {
panic!()
}
self.page_image_writer.write_all(block_bytes)?;
self.num_blocks_written += 1;
Ok(())
}
pub fn finish(self) -> Result<ImageLayer> {
// Check that the `put_page_image' was called for every block.
assert!(self.num_blocks_written == self.num_blocks);
pub fn finish(&mut self) -> Result<ImageLayer> {
// Close the values chapter
let book = self.values_writer.take().unwrap().close()?;
// Close the page-images chapter
let book = self.page_image_writer.close()?;
// Write out the index
let mut chapter = book.new_chapter(INDEX_CHAPTER);
let buf = HashMap::ser(&self.index)?;
chapter.write_all(&buf)?;
let book = chapter.close()?;
// Write out the summary chapter
let image_type = if self.seg.rel.is_blocky() {
ImageType::Blocky {
num_blocks: self.num_blocks,
}
} else {
ImageType::NonBlocky
};
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
let summary = Summary {
tenantid: self.tenantid,
timelineid: self.timelineid,
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
};
Summary::ser_into(&summary, &mut chapter)?;
@@ -520,15 +492,31 @@ impl ImageLayerWriter {
path_or_conf: PathOrConf::Conf(self.conf),
timelineid: self.timelineid,
tenantid: self.tenantid,
seg: self.seg,
key_range: self.key_range.clone(),
lsn: self.lsn,
inner: Mutex::new(ImageLayerInner {
book: None,
image_type,
loaded: false,
index: HashMap::new(),
}),
};
trace!("created image layer {}", layer.path().display());
self.finished = true;
Ok(layer)
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(page_image_writer) = self.values_writer.take() {
if let Ok(book) = page_image_writer.close() {
let _ = book.close();
}
}
if !self.finished {
let _ = fs::remove_file(&self.path);
}
}
}
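A hedged sketch of the intended write path with the new writer; `conf`, `timelineid`, `tenantid`, `key_range`, `lsn` and the `images` iterator of (Key, Bytes) pairs stand in for values a real caller would supply:
// Sketch only: build an image layer covering `key_range` as of `lsn`.
let mut writer = ImageLayerWriter::new(conf, timelineid, tenantid, &key_range, lsn)?;
for (key, img) in images {
    // Every key must fall inside `key_range`, and each key may only be written once.
    writer.put_image(key, &img)?;
}
let image_layer = writer.finish()?;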

View File

@@ -1,30 +1,24 @@
//! An in-memory layer stores recently received PageVersions.
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
//! and layers can be spilled to disk into ephemeral files.
//! An in-memory layer stores recently received key-value pairs.
//!
//! And there's another BTreeMap to track the size of the relation.
//! The "in-memory" part of the name is a bit misleading: the actual page versions are
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
//! its position in the file, is kept in memory, though.
//!
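A minimal sketch of the offset index described above, simplified to a plain u64 key and assuming the same VecMap calls that are used later in this file:
use std::collections::HashMap;
use zenith_utils::lsn::Lsn;
use zenith_utils::vec_map::VecMap;

// Sketch only: each key maps to an LSN-ordered list of byte offsets into the
// ephemeral file where the serialized Values live.
let mut index: HashMap<u64, VecMap<Lsn, u64>> = HashMap::new();
let vec_map = index.entry(42).or_default();
vec_map.append_or_update_last(Lsn(0x10), 0).unwrap();   // older value at offset 0
vec_map.append_or_update_last(Lsn(0x20), 128).unwrap(); // newer value at offset 128

// Read path: take the newest entry at or below the requested LSN.
let slice = vec_map.slice_range(..=Lsn(0x18));
assert_eq!(slice.last().map(|(_lsn, off)| *off), Some(0));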
use crate::config::PageServerConf;
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
use crate::layered_repository::ephemeral_file::EphemeralFile;
use crate::layered_repository::filename::DeltaFileName;
use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter};
use crate::layered_repository::storage_layer::{
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
RELISH_SEG_SIZE,
Layer, ValueReconstructResult, ValueReconstructState,
};
use crate::layered_repository::LayeredTimeline;
use crate::layered_repository::ZERO_PAGE;
use crate::repository::ZenithWalRecord;
use crate::layered_repository::utils;
use crate::repository::{Key, Value};
use crate::{ZTenantId, ZTimelineId};
use anyhow::{ensure, Result};
use bytes::Bytes;
use anyhow::Result;
use log::*;
use std::collections::HashMap;
use std::io::Seek;
use std::os::unix::fs::FileExt;
use std::collections::{HashMap, HashSet};
use std::ops::Range;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use std::sync::RwLock;
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use zenith_utils::vec_map::VecMap;
@@ -33,7 +27,6 @@ pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
seg: SegmentTag,
///
/// This layer contains all the changes from 'start_lsn'. The
@@ -42,7 +35,7 @@ pub struct InMemoryLayer {
start_lsn: Lsn,
///
/// LSN of the oldest page version stored in this layer.
/// LSN of the oldest value stored in this layer.
///
/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
/// of a layer always matches the 'end_lsn' of its predecessor, even if there
@@ -59,9 +52,6 @@ pub struct InMemoryLayer {
/// The above fields never change. The parts that do change are in 'inner',
/// and protected by mutex.
inner: RwLock<InMemoryLayerInner>,
/// Predecessor layer might be needed?
incremental: bool,
}
pub struct InMemoryLayerInner {
@@ -69,98 +59,25 @@ pub struct InMemoryLayerInner {
/// Writes are only allowed when this is None
end_lsn: Option<Lsn>,
/// If this relation was dropped, remember when that happened.
/// The drop LSN is recorded in [`end_lsn`].
dropped: bool,
///
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The value is an offset into the
/// ephemeral file where the page version is stored.
///
index: HashMap<Key, VecMap<Lsn, u64>>,
/// The PageVersion structs are stored in a serialized format in this file.
/// Each serialized PageVersion is preceded by a 'u32' length field.
/// 'page_versions' map stores offsets into this file.
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// The 'index' map stores offsets into this file.
file: EphemeralFile,
/// Metadata about all versions of all pages in the layer is kept
/// here. Indexed by block number and LSN. The value is an offset
/// into the ephemeral file where the page version is stored.
page_versions: HashMap<SegmentBlk, VecMap<Lsn, u64>>,
///
/// `seg_sizes` tracks the size of the segment at different points in time.
///
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
/// so that determining the size never depends on the predecessor layer. For
/// a non-blocky rel, 'seg_sizes' is not used and is always empty.
///
seg_sizes: VecMap<Lsn, SegmentBlk>,
///
/// LSN of the newest page version stored in this layer.
///
/// The difference between 'end_lsn' and 'latest_lsn' is the same as between
/// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
///
latest_lsn: Lsn,
end_offset: u64,
}
impl InMemoryLayerInner {
fn assert_writeable(&self) {
assert!(self.end_lsn.is_none());
}
fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk {
// Scan the BTreeMap backwards, starting from the given entry.
let slice = self.seg_sizes.slice_range(..=lsn);
// We make sure there is always at least one entry
if let Some((_entry_lsn, entry)) = slice.last() {
*entry
} else {
panic!("could not find seg size in in-memory layer");
}
}
///
/// Read a page version from the ephemeral file.
///
fn read_pv(&self, off: u64) -> Result<PageVersion> {
let mut buf = Vec::new();
self.read_pv_bytes(off, &mut buf)?;
Ok(PageVersion::des(&buf)?)
}
///
/// Read a page version from the ephemeral file, as raw bytes, at
/// the given offset. The bytes are read into 'buf', which is
/// expanded if necessary. Returns the size of the page version.
///
fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
// read length
let mut lenbuf = [0u8; 4];
self.file.read_exact_at(&mut lenbuf, off)?;
let len = u32::from_ne_bytes(lenbuf) as usize;
if buf.len() < len {
buf.resize(len, 0);
}
self.file.read_exact_at(&mut buf[0..len], off + 4)?;
Ok(len)
}
fn write_pv(&mut self, pv: &PageVersion) -> Result<u64> {
// remember starting position
let pos = self.file.stream_position()?;
// make room for the 'length' field by writing zeros as a placeholder.
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
pv.ser_into(&mut self.file).unwrap();
// write the 'length' field.
let len = self.file.stream_position()? - pos - 4;
let lenbuf = u32::to_ne_bytes(len as u32);
self.file.write_all_at(&lenbuf, pos)?;
Ok(pos)
}
}
impl Layer for InMemoryLayer {
@@ -170,21 +87,12 @@ impl Layer for InMemoryLayer {
fn filename(&self) -> PathBuf {
let inner = self.inner.read().unwrap();
let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
drop_lsn
} else {
Lsn(u64::MAX)
};
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
let delta_filename = DeltaFileName {
seg: self.seg,
start_lsn: self.start_lsn,
end_lsn,
dropped: inner.dropped,
}
.to_string();
PathBuf::from(format!("inmem-{}", delta_filename))
PathBuf::from(format!(
"inmem-{:016X}-{:016X}",
self.start_lsn.0, end_lsn.0
))
}
fn get_tenant_id(&self) -> ZTenantId {
@@ -195,132 +103,85 @@ impl Layer for InMemoryLayer {
self.timelineid
}
fn get_seg_tag(&self) -> SegmentTag {
self.seg
fn get_key_range(&self) -> Range<Key> {
Key::MIN..Key::MAX
}
fn get_start_lsn(&self) -> Lsn {
self.start_lsn
}
fn get_end_lsn(&self) -> Lsn {
fn get_lsn_range(&self) -> Range<Lsn> {
let inner = self.inner.read().unwrap();
if let Some(end_lsn) = inner.end_lsn {
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
end_lsn
} else {
Lsn(u64::MAX)
}
};
self.start_lsn..end_lsn
}
fn is_dropped(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.dropped
}
/// Look up given page in the cache.
fn get_page_reconstruct_data(
/// Look up given value in the layer.
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult> {
lsn_floor: Lsn,
reconstruct_state: &mut ValueReconstructState,
) -> Result<ValueReconstructResult> {
assert!(lsn_floor <= self.start_lsn);
let mut need_image = true;
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
let inner = self.inner.read().unwrap();
{
let inner = self.inner.read().unwrap();
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.page_versions.get(&blknum) {
let slice = vec_map.slice_range(..=lsn);
for (entry_lsn, pos) in slice.iter().rev() {
match &reconstruct_data.page_img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
return Ok(PageReconstructResult::Complete)
}
_ => {}
// Scan the page versions backwards, starting from `lsn`.
if let Some(vec_map) = inner.index.get(&reconstruct_state.key) {
let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn);
for (entry_lsn, pos) in slice.iter().rev() {
match &reconstruct_state.img {
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
return Ok(ValueReconstructResult::Complete)
}
_ => {}
}
let pv = inner.read_pv(*pos)?;
match pv {
PageVersion::Page(img) => {
reconstruct_data.page_img = Some((*entry_lsn, img));
let value = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
match value {
Value::Image(img) => {
reconstruct_state.img = Some((*entry_lsn, img));
reconstruct_state.lsn = *entry_lsn;
return Ok(ValueReconstructResult::Complete);
}
Value::WalRecord(rec) => {
let will_init = rec.will_init();
reconstruct_state.records.push((*entry_lsn, rec));
if will_init {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
PageVersion::Wal(rec) => {
reconstruct_data.records.push((*entry_lsn, rec.clone()));
if rec.will_init() {
// This WAL record initializes the page, so no need to go further back
need_image = false;
break;
}
}
}
}
}
// If we didn't find any records for this, check if the request is beyond EOF
if need_image
&& reconstruct_data.records.is_empty()
&& self.seg.rel.is_blocky()
&& blknum >= self.get_seg_size(lsn)?
{
return Ok(PageReconstructResult::Missing(self.start_lsn));
}
// release lock on 'inner'
}
// release lock on 'inner'
// If an older page image is needed to reconstruct the page, let the
// caller know
// caller know.
if need_image {
if self.incremental {
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
} else {
Ok(PageReconstructResult::Missing(self.start_lsn))
}
reconstruct_state.lsn = Lsn(self.start_lsn.0 - 1);
Ok(ValueReconstructResult::Continue)
} else {
Ok(PageReconstructResult::Complete)
Ok(ValueReconstructResult::Complete)
}
}
/// Get size of the relation at given LSN
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
assert!(lsn >= self.start_lsn);
ensure!(
self.seg.rel.is_blocky(),
"get_seg_size() called on a non-blocky rel"
);
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()> {
let inner = self.inner.read().unwrap();
Ok(inner.get_seg_size(lsn))
keys.extend(inner.index.keys().filter(|x| key_range.contains(x)));
Ok(())
}
/// Does this segment exist at given LSN?
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
let inner = self.inner.read().unwrap();
// If the segment created after requested LSN,
// it doesn't exist in the layer. But we shouldn't
// have requested it in the first place.
assert!(lsn >= self.start_lsn);
// Is the requested LSN after the segment was dropped?
if inner.dropped {
if let Some(end_lsn) = inner.end_lsn {
if lsn >= end_lsn {
return Ok(false);
}
} else {
panic!("dropped in-memory layer with no end LSN");
}
}
// Otherwise, it exists
Ok(true)
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
todo!();
}
/// Cannot unload anything in an in-memory layer, since there's no backing
@@ -337,7 +198,8 @@ impl Layer for InMemoryLayer {
}
fn is_incremental(&self) -> bool {
self.incremental
// in-memory layer is always considered incremental.
true
}
fn is_in_memory(&self) -> bool {
@@ -355,53 +217,39 @@ impl Layer for InMemoryLayer {
.unwrap_or_default();
println!(
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
"----- in-memory layer for tli {} LSNs {}-{} ----",
self.timelineid,
self.start_lsn,
end_str,
//inner.dropped,
);
for (k, v) in inner.seg_sizes.as_slice() {
println!("seg_sizes {}: {}", k, v);
}
// FIXME
/*
for (blknum, versions) in page_versions {
for (lsn, off) in versions.as_slice() {
let pv = inner.read_pv(*off);
let pv_description = match pv {
Ok(PageVersion::Page(_img)) => "page",
Ok(PageVersion::Wal(_rec)) => "wal",
Err(_err) => "INVALID",
};
// List the blocks in order
let mut page_versions: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> =
inner.page_versions.iter().collect();
page_versions.sort_by_key(|k| k.0);
for (blknum, versions) in page_versions {
for (lsn, off) in versions.as_slice() {
let pv = inner.read_pv(*off);
let pv_description = match pv {
Ok(PageVersion::Page(_img)) => "page",
Ok(PageVersion::Wal(_rec)) => "wal",
Err(_err) => "INVALID",
};
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
}
}
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
}
}
*/
Ok(())
}
}
/// A result of an inmemory layer data being written to disk.
pub struct LayersOnDisk {
pub delta_layers: Vec<DeltaLayer>,
pub image_layers: Vec<ImageLayer>,
}
impl InMemoryLayer {
/// Return the oldest page version that's stored in this layer
pub fn get_oldest_lsn(&self) -> Lsn {
self.oldest_lsn
}
pub fn get_latest_lsn(&self) -> Lsn {
let inner = self.inner.read().unwrap();
inner.latest_lsn
}
///
/// Create a new, empty, in-memory layer
///
@@ -409,268 +257,88 @@ impl InMemoryLayer {
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
seg: SegmentTag,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!(
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
seg,
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
timelineid,
start_lsn
);
// The segment is initially empty, so initialize 'seg_sizes' with 0.
let mut seg_sizes = VecMap::default();
if seg.rel.is_blocky() {
seg_sizes.append(start_lsn, 0).unwrap();
}
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
oldest_lsn,
incremental: false,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
index: HashMap::new(),
file,
page_versions: HashMap::new(),
seg_sizes,
latest_lsn: oldest_lsn,
end_offset: 0,
}),
})
}
// Write operations
/// Remember new page version, as a WAL record over previous version
pub fn put_wal_record(
&self,
lsn: Lsn,
blknum: SegmentBlk,
rec: ZenithWalRecord,
) -> Result<u32> {
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
}
/// Remember new page version, as a full page image
pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result<u32> {
self.put_page_version(blknum, lsn, PageVersion::Page(img))
}
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result<u32> {
assert!((0..RELISH_SEG_SIZE).contains(&blknum));
trace!(
"put_page_version blk {} of {} at {}/{}",
blknum,
self.seg.rel,
self.timelineid,
lsn
);
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
assert!(lsn >= inner.latest_lsn);
inner.latest_lsn = lsn;
// Write the page version to the file, and remember its offset in 'page_versions'
{
let off = inner.write_pv(&pv)?;
let vec_map = inner.page_versions.entry(blknum).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!(
"Page version of rel {} blk {} at {} already exists",
self.seg.rel, blknum, lsn
);
}
}
// Also update the relation size, if this extended the relation.
if self.seg.rel.is_blocky() {
let newsize = blknum + 1;
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
// which we've just acquired above
let oldsize = inner.get_seg_size(lsn);
if newsize > oldsize {
trace!(
"enlarging segment {} from {} to {} blocks at {}",
self.seg,
oldsize,
newsize,
lsn
);
// If we are extending the relation by more than one page, initialize the "gap"
// with zeros
//
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
// I don't think that can happen currently, but that is highly dependent on how
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
// happen, we would hit the "page version already exists" warning above on the
// subsequent call to initialize the gap page.
for gapblknum in oldsize..blknum {
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
trace!(
"filling gap blk {} with zeros for write of {}",
gapblknum,
blknum
);
// Write the page version to the file, and remember its offset in
// 'page_versions'
{
let off = inner.write_pv(&zeropv)?;
let vec_map = inner.page_versions.entry(gapblknum).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
warn!(
"Page version of seg {} blk {} at {} already exists",
self.seg, gapblknum, lsn
);
}
}
}
inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap();
return Ok(newsize - oldsize);
}
}
Ok(0)
}
/// Remember that the relation was truncated at given LSN
pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) {
assert!(
self.seg.rel.is_blocky(),
"put_truncation() called on a non-blocky rel"
);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
// check that this we truncate to a smaller size than segment was before the truncation
let old_size = inner.get_seg_size(lsn);
assert!(new_size < old_size);
let (old, _delta_size) = inner
.seg_sizes
.append_or_update_last(lsn, new_size)
.unwrap();
let off = inner.end_offset;
let len = utils::write_blob(&mut inner.file, &Value::ser(&val)?)?;
inner.end_offset += len;
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
warn!("Inserting truncation, but had an entry for the LSN already");
}
}
/// Remember that the segment was dropped at given LSN
pub fn drop_segment(&self, lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
assert!(inner.end_lsn.is_none());
assert!(!inner.dropped);
inner.dropped = true;
assert!(self.start_lsn < lsn);
inner.end_lsn = Some(lsn);
trace!("dropped segment {} at {}", self.seg, lsn);
}
///
/// Initialize a new InMemoryLayer for, by copying the state at the given
/// point in time from given existing layer.
///
pub fn create_successor_layer(
conf: &'static PageServerConf,
src: Arc<dyn Layer>,
timelineid: ZTimelineId,
tenantid: ZTenantId,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Result<InMemoryLayer> {
let seg = src.get_seg_tag();
assert!(oldest_lsn.is_aligned());
trace!(
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
seg,
timelineid,
start_lsn,
);
// Copy the segment size at the start LSN from the predecessor layer.
let mut seg_sizes = VecMap::default();
if seg.rel.is_blocky() {
let size = src.get_seg_size(start_lsn)?;
seg_sizes.append(start_lsn, size).unwrap();
warn!("Key {} at {} already exists", key, lsn);
}
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
Ok(InMemoryLayer {
conf,
timelineid,
tenantid,
seg,
start_lsn,
oldest_lsn,
incremental: true,
inner: RwLock::new(InMemoryLayerInner {
end_lsn: None,
dropped: false,
file,
page_versions: HashMap::new(),
seg_sizes,
latest_lsn: oldest_lsn,
}),
})
Ok(())
}
pub fn is_writeable(&self) -> bool {
let inner = self.inner.read().unwrap();
inner.end_lsn.is_none()
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())
}
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is inclusive
/// `end_lsn` is exclusive
pub fn freeze(&self, end_lsn: Lsn) {
let mut inner = self.inner.write().unwrap();
if inner.end_lsn.is_some() {
assert!(inner.dropped);
} else {
assert!(!inner.dropped);
assert!(self.start_lsn < end_lsn + 1);
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
assert!(self.start_lsn < end_lsn);
inner.end_lsn = Some(end_lsn);
if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() {
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
// FIXME
/*
for perseg in inner.segs.values() {
if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() {
assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn);
}
for (_blk, vec_map) in inner.page_versions.iter() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn <= end_lsn);
for (_blk, vec_map) in perseg.page_versions.iter() {
for (lsn, _pos) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
}
}
}
}
*/
}
/// Write the this frozen in-memory layer to disk.
/// Write this frozen in-memory layer to disk.
///
/// Returns new layers that replace this one.
/// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
@@ -678,17 +346,7 @@ impl InMemoryLayer {
/// WAL records between start and end LSN. (The delta layer is not needed
/// when a new relish is created with a single LSN, so that the start and
/// end LSN are the same.)
pub fn write_to_disk(
&self,
timeline: &LayeredTimeline,
reconstruct_pages: bool,
) -> Result<LayersOnDisk> {
trace!(
"write_to_disk {} get_end_lsn is {}",
self.filename().display(),
self.get_end_lsn()
);
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -700,105 +358,30 @@ impl InMemoryLayer {
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().unwrap();
// Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN
// that is included.
let end_lsn_exclusive = inner.end_lsn.unwrap();
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
Key::MIN,
self.start_lsn..inner.end_lsn.unwrap(),
)?;
// Figure out if we should create a delta layer, image layer, or both.
let image_lsn: Option<Lsn>;
let delta_end_lsn: Option<Lsn>;
if self.is_dropped() || !reconstruct_pages {
// The segment was dropped. Create just a delta layer containing all the
// changes up to and including the drop.
delta_end_lsn = Some(end_lsn_exclusive);
image_lsn = None;
} else if self.start_lsn == end_lsn_inclusive {
// The layer contains exactly one LSN. It's enough to write an image
// layer at that LSN.
delta_end_lsn = None;
image_lsn = Some(end_lsn_inclusive);
} else {
// Create a delta layer with all the changes up to the end LSN,
// and an image layer at the end LSN.
//
// Note that we the delta layer does *not* include the page versions
// at the end LSN. They are included in the image layer, and there's
// no need to store them twice.
delta_end_lsn = Some(end_lsn_inclusive);
image_lsn = Some(end_lsn_inclusive);
}
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
if let Some(delta_end_lsn) = delta_end_lsn {
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
self.start_lsn,
delta_end_lsn,
self.is_dropped(),
)?;
// Write all page versions, in block + LSN order
let mut buf: Vec<u8> = Vec::new();
let pv_iter = inner.page_versions.iter();
let mut pages: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> = pv_iter.collect();
pages.sort_by_key(|(blknum, _vec_map)| *blknum);
for (blknum, vec_map) in pages {
let mut do_steps = || -> Result<()> {
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
if *lsn < delta_end_lsn {
let len = inner.read_pv_bytes(*pos, &mut buf)?;
delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?;
}
let val = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
delta_layer_writer.put_value(*key, *lsn, val)?;
}
}
// Create seg_sizes
let seg_sizes = if delta_end_lsn == end_lsn_exclusive {
inner.seg_sizes.clone()
} else {
inner.seg_sizes.split_at(&end_lsn_exclusive).0
};
let delta_layer = delta_layer_writer.finish(seg_sizes)?;
delta_layers.push(delta_layer);
Ok(())
};
if let Err(err) = do_steps() {
delta_layer_writer.abort();
return Err(err);
}
drop(inner);
// Write a new base image layer at the cutoff point
if let Some(image_lsn) = image_lsn {
let size = if self.seg.rel.is_blocky() {
self.get_seg_size(image_lsn)?
} else {
1
};
let mut image_layer_writer = ImageLayerWriter::new(
self.conf,
self.timelineid,
self.tenantid,
self.seg,
image_lsn,
size,
)?;
for blknum in 0..size {
let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?;
image_layer_writer.put_page_image(&img)?;
}
let image_layer = image_layer_writer.finish()?;
image_layers.push(image_layer);
}
Ok(LayersOnDisk {
delta_layers,
image_layers,
})
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
Ok(delta_layer)
}
}
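For orientation, a rough sketch of the new checkpoint path, assuming `layer` is the open InMemoryLayer and `end_lsn` is where the caller wants to cut it off:
// Sketch only: freeze the open layer at `end_lsn` (exclusive) and flush it.
layer.freeze(end_lsn);
let delta_layer: DeltaLayer = layer.write_to_disk()?;
// The resulting delta layer covers Key::MIN..Key::MAX and start_lsn..end_lsn;
// image layers are now created separately, not as part of this call.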

View File

@@ -1,468 +0,0 @@
///
/// IntervalTree is data structure for holding intervals. It is generic
/// to make unit testing possible, but the only real user of it is the layer map,
///
/// It's inspired by the "segment tree" or a "statistic tree" as described in
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
/// the points instead of a binary tree. This is called an "interval tree" instead
/// of "segment tree" because the term "segment" is already using Zenith to mean
/// something else. To add to the confusion, there is another data structure known
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
/// for storing intervals, but this isn't that.
///
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
/// there is a list of intervals that contain the point. The Points are formed
/// from the start bounds of each interval; there is a Point for each distinct
/// start bound.
///
/// Operations:
///
/// To find intervals that contain a given point, you search the b-tree to find
/// the nearest Point <= search key. Then you just return the list of intervals.
///
/// To insert an interval, find the Point with start key equal to the inserted item.
/// If the Point doesn't exist yet, create it, by copying all the items from the
/// previous Point that cover the new Point. Then walk right, inserting the new
/// interval to all the Points that are contained by the new interval (including the
/// newly created Point).
///
/// To remove an interval, you scan the tree for all the Points that are contained by
/// the removed interval, and remove it from the list in each Point.
///
/// Requirements and assumptions:
///
/// - Can store overlapping items
/// - But there are not many overlapping items
/// - The interval bounds don't change after it is added to the tree
/// - Intervals are uniquely identified by pointer equality. You must not insert the
/// same interval object twice, and `remove` uses pointer equality to remove the right
/// interval. It is OK to have two intervals with the same bounds, however.
///
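This file is also removed by the commit; for reference, a minimal usage sketch (the unit tests at the end of the file exercise the tree more thoroughly; their MockItem helper is borrowed here purely for illustration):
// Sketch only, borrowing MockItem from the tests module below.
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
tree.insert(Arc::new(MockItem::new(10, 20)));
tree.insert(Arc::new(MockItem::new(15, 30)));

// search(18): the nearest Point <= 18 is 15; both intervals contain it, and
// the one with the higher end bound wins.
let hit = tree.search(18).unwrap();
assert_eq!(hit.to_string(), "15-30");

// Removal uses pointer equality, so pass back the Arc returned by search.
tree.remove(&hit);
assert_eq!(tree.search(18).unwrap().to_string(), "10-20");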
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::ops::Range;
use std::sync::Arc;
pub struct IntervalTree<I: ?Sized>
where
I: IntervalItem,
{
points: BTreeMap<I::Key, Point<I>>,
}
struct Point<I: ?Sized> {
/// All intervals that contain this point, in no particular order.
///
/// We assume that there aren't a lot of overlapping intervals, so that this vector
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
/// a linear search is OK.
elements: Vec<Arc<I>>,
}
/// Abstraction for an interval that can be stored in the tree
///
/// The start bound is inclusive and the end bound is exclusive. End must be greater
/// than start.
pub trait IntervalItem {
type Key: Ord + Copy + Debug + Sized;
fn start_key(&self) -> Self::Key;
fn end_key(&self) -> Self::Key;
fn bounds(&self) -> Range<Self::Key> {
self.start_key()..self.end_key()
}
}
impl<I: ?Sized> IntervalTree<I>
where
I: IntervalItem,
{
/// Return an element that contains 'key', or precedes it.
///
/// If there are multiple candidates, returns the one with the highest 'end' key.
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
// Find the greatest point that precedes or is equal to the search key. If there is
// none, returns None.
let (_, p) = self.points.range(..=key).next_back()?;
// Find the element with the highest end key at this point
let highest_item = p
.elements
.iter()
.reduce(|a, b| {
// starting with Rust 1.53, could use `std::cmp::max_by_key` here
if a.end_key() > b.end_key() {
a
} else {
b
}
})
.unwrap();
Some(Arc::clone(highest_item))
}
/// Iterate over all items with start bound >= 'key'
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(key..),
elem_iter: None,
}
}
/// Iterate over all items
pub fn iter(&self) -> IntervalIter<I> {
IntervalIter {
point_iter: self.points.range(..),
elem_iter: None,
}
}
pub fn insert(&mut self, item: Arc<I>) {
let start_key = item.start_key();
let end_key = item.end_key();
assert!(start_key < end_key);
let bounds = start_key..end_key;
// Find the starting point and walk forward from there
let mut found_start_point = false;
let iter = self.points.range_mut(bounds);
for (point_key, point) in iter {
if *point_key == start_key {
found_start_point = true;
// It is an error to insert the same item to the tree twice.
assert!(
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
"interval is already in the tree"
);
}
point.elements.push(Arc::clone(&item));
}
if !found_start_point {
// Create a new Point for the starting point
// Look at the previous point, and copy over elements that overlap with this
// new point
let mut new_elements: Vec<Arc<I>> = Vec::new();
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
let overlapping_prev_elements = prev_point
.elements
.iter()
.filter(|x| x.bounds().contains(&start_key))
.cloned();
new_elements.extend(overlapping_prev_elements);
}
new_elements.push(item);
let new_point = Point {
elements: new_elements,
};
self.points.insert(start_key, new_point);
}
}
pub fn remove(&mut self, item: &Arc<I>) {
// range search points
let start_key = item.start_key();
let end_key = item.end_key();
let bounds = start_key..end_key;
let mut points_to_remove: Vec<I::Key> = Vec::new();
let mut found_start_point = false;
for (point_key, point) in self.points.range_mut(bounds) {
if *point_key == start_key {
found_start_point = true;
}
let len_before = point.elements.len();
point.elements.retain(|other| !Arc::ptr_eq(other, item));
let len_after = point.elements.len();
assert_eq!(len_after + 1, len_before);
if len_after == 0 {
points_to_remove.push(*point_key);
}
}
assert!(found_start_point);
for k in points_to_remove {
self.points.remove(&k).unwrap();
}
}
}
pub struct IntervalIter<'a, I: ?Sized>
where
I: IntervalItem,
{
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
}
impl<'a, I> Iterator for IntervalIter<'a, I>
where
I: IntervalItem + ?Sized,
{
type Item = Arc<I>;
fn next(&mut self) -> Option<Self::Item> {
// Iterate over all elements in all the points in 'point_iter'. To avoid
// returning the same element twice, we only return each element at its
// starting point.
loop {
// Return next remaining element from the current point
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
for elem in elem_iter {
if elem.start_key() == *point_key {
return Some(Arc::clone(elem));
}
}
}
// No more elements at this point. Move to next point.
if let Some((point_key, point)) = self.point_iter.next() {
self.elem_iter = Some((*point_key, point.elements.iter()));
continue;
} else {
// No more points, all done
return None;
}
}
}
}
impl<I: ?Sized> Default for IntervalTree<I>
where
I: IntervalItem,
{
fn default() -> Self {
IntervalTree {
points: BTreeMap::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fmt;
#[derive(Debug)]
struct MockItem {
start_key: u32,
end_key: u32,
val: String,
}
impl IntervalItem for MockItem {
type Key = u32;
fn start_key(&self) -> u32 {
self.start_key
}
fn end_key(&self) -> u32 {
self.end_key
}
}
impl MockItem {
fn new(start_key: u32, end_key: u32) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}", start_key, end_key),
}
}
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
MockItem {
start_key,
end_key,
val: format!("{}-{}: {}", start_key, end_key, val),
}
}
}
impl fmt::Display for MockItem {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.val)
}
}
#[rustfmt::skip]
fn assert_search(
tree: &IntervalTree<MockItem>,
key: u32,
expected: &[&str],
) -> Option<Arc<MockItem>> {
if let Some(v) = tree.search(key) {
let vstr = v.to_string();
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
assert!(
expected.contains(&vstr.as_str()),
"search with {} returned {}, expected one of: {:?}",
key, v, expected,
);
Some(v)
} else {
assert!(
expected.is_empty(),
"search with {} returned None, expected one of {:?}",
key, expected
);
None
}
}
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
contents.sort();
assert_eq!(contents, expected);
}
fn dump_tree(tree: &IntervalTree<MockItem>) {
for (point_key, point) in tree.points.iter() {
print!("{}:", point_key);
for e in point.elements.iter() {
print!(" {}", e);
}
println!();
}
}
#[test]
fn test_interval_tree_simple() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Simple, non-overlapping ranges.
tree.insert(Arc::new(MockItem::new(10, 11)));
tree.insert(Arc::new(MockItem::new(11, 12)));
tree.insert(Arc::new(MockItem::new(12, 13)));
tree.insert(Arc::new(MockItem::new(18, 19)));
tree.insert(Arc::new(MockItem::new(17, 18)));
tree.insert(Arc::new(MockItem::new(15, 16)));
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &["10-11"]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["12-13"]);
assert_search(&tree, 13, &["12-13"]);
assert_search(&tree, 14, &["12-13"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 16, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["18-19"]);
assert_search(&tree, 19, &["18-19"]);
assert_search(&tree, 20, &["18-19"]);
// remove a few entries and search around them again
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
assert_search(&tree, 9, &[]);
assert_search(&tree, 10, &[]);
assert_search(&tree, 11, &["11-12"]);
assert_search(&tree, 12, &["11-12"]);
assert_search(&tree, 14, &["11-12"]);
assert_search(&tree, 15, &["15-16"]);
assert_search(&tree, 17, &["17-18"]);
assert_search(&tree, 18, &["17-18"]);
}
#[test]
fn test_interval_tree_overlap() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Overlapping items
tree.insert(Arc::new(MockItem::new(22, 24)));
tree.insert(Arc::new(MockItem::new(23, 25)));
let x24_26 = Arc::new(MockItem::new(24, 26));
tree.insert(Arc::clone(&x24_26));
let x26_28 = Arc::new(MockItem::new(26, 28));
tree.insert(Arc::clone(&x26_28));
tree.insert(Arc::new(MockItem::new(25, 27)));
assert_search(&tree, 22, &["22-24"]);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25", "24-26"]);
assert_search(&tree, 25, &["24-26", "25-27"]);
assert_search(&tree, 26, &["25-27", "26-28"]);
assert_search(&tree, 27, &["26-28"]);
assert_search(&tree, 28, &["26-28"]);
assert_search(&tree, 29, &["26-28"]);
tree.remove(&x24_26);
tree.remove(&x26_28);
assert_search(&tree, 23, &["22-24", "23-25"]);
assert_search(&tree, 24, &["23-25"]);
assert_search(&tree, 25, &["25-27"]);
assert_search(&tree, 26, &["25-27"]);
assert_search(&tree, 27, &["25-27"]);
assert_search(&tree, 28, &["25-27"]);
assert_search(&tree, 29, &["25-27"]);
}
#[test]
fn test_interval_tree_nested() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Items containing other items
tree.insert(Arc::new(MockItem::new(31, 39)));
tree.insert(Arc::new(MockItem::new(32, 34)));
tree.insert(Arc::new(MockItem::new(33, 35)));
tree.insert(Arc::new(MockItem::new(30, 40)));
assert_search(&tree, 30, &["30-40"]);
assert_search(&tree, 31, &["30-40", "31-39"]);
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
assert_search(&tree, 35, &["30-40", "31-39"]);
assert_search(&tree, 36, &["30-40", "31-39"]);
assert_search(&tree, 37, &["30-40", "31-39"]);
assert_search(&tree, 38, &["30-40", "31-39"]);
assert_search(&tree, 39, &["30-40"]);
assert_search(&tree, 40, &["30-40"]);
assert_search(&tree, 41, &["30-40"]);
}
#[test]
fn test_interval_tree_duplicates() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Duplicate keys
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
tree.insert(Arc::clone(&item_a));
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
tree.insert(Arc::clone(&item_b));
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
tree.insert(Arc::clone(&item_c));
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
tree.insert(Arc::clone(&item_d));
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
tree.insert(Arc::clone(&item_e));
dump_tree(&tree);
assert_search(
&tree,
55,
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
);
tree.remove(&item_b);
dump_tree(&tree);
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
tree.remove(&item_d);
dump_tree(&tree);
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
}
#[test]
#[should_panic]
fn test_interval_tree_insert_twice() {
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
// Inserting the same item twice is not cool
let item = Arc::new(MockItem::new(1, 2));
tree.insert(Arc::clone(&item));
tree.insert(Arc::clone(&item)); // fails assertion
}
}

View File

@@ -3,30 +3,26 @@
//!
//! When the timeline is first accessed, the server lists all the layer files
//! in the timelines/<timelineid> directory, and populates this map with
//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL
//! is received, we create InMemoryLayers to hold the incoming records. Now and
//! then, in the checkpoint() function, the in-memory layers are frozen, forming
//! new image and delta layers and corresponding files are written to disk.
//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
//! new WAL record is received, we create an InMemoryLayer to hold the incoming
//! records. Now and then, in the checkpoint() function, the in-memory layer is
//! frozen, and it is split up into new image and delta layers and the
//! corresponding files are written to disk.
//!
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
use crate::layered_repository::storage_layer::range_overlaps;
use crate::layered_repository::storage_layer::Layer;
use crate::layered_repository::InMemoryLayer;
use crate::relish::*;
use crate::repository::Key;
use anyhow::Result;
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::Arc;
use tracing::*;
use zenith_metrics::{register_int_gauge, IntGauge};
use zenith_utils::lsn::Lsn;
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
lazy_static! {
static ref NUM_INMEMORY_LAYERS: IntGauge =
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
.expect("failed to define a metric");
static ref NUM_ONDISK_LAYERS: IntGauge =
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
.expect("failed to define a metric");
@@ -37,108 +33,147 @@ lazy_static! {
///
#[derive(Default)]
pub struct LayerMap {
/// All the layers keyed by segment tag
segs: HashMap<SegmentTag, SegEntry>,
//
// 'open_layer' holds the current InMemoryLayer that is accepting new
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
// where the start LSN of the next InMemoryLayer that is to be created.
//
pub open_layer: Option<Arc<InMemoryLayer>>,
pub next_open_layer_at: Option<Lsn>,
/// All in-memory layers, ordered by 'oldest_lsn' and generation
/// of each layer. This allows easy access to the in-memory layer that
/// contains the oldest WAL record.
open_layers: BinaryHeap<OpenLayerEntry>,
///
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
/// layer only exists during checkpointing, when an InMemoryLayer is being written out
/// to disk.
///
pub frozen_layer: Option<Arc<InMemoryLayer>>,
/// Generation number, used to distinguish newly inserted entries in the
/// binary heap from older entries during checkpoint.
current_generation: u64,
/// All the historic layers are kept here
/// TODO: This is a placeholder implementation of a data structure
/// to hold information about all the layer files on disk and in
/// S3. Currently, it's just a vector and all operations perform a
/// linear scan over it. That obviously becomes slow as the
/// number of layers grows. I'm imagining that an R-tree or some
/// other 2D data structure would be the long-term solution here.
historic_layers: Vec<Arc<dyn Layer>>,
}
pub struct SearchResult {
pub layer: Arc<dyn Layer>,
pub lsn_floor: Lsn,
}
impl LayerMap {
///
/// Look up a layer using the given segment tag and LSN. This differs from a
/// plain key-value lookup in that if there is any layer that covers the
/// given LSN, or precedes the given LSN, it is returned. In other words,
/// you don't need to know the exact start LSN of the layer.
///
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
let segentry = self.segs.get(tag)?;
pub fn search(&self, key: Key, lsn: Lsn) -> Result<Option<SearchResult>> {
// linear search
// Find the latest image layer that covers the given key
let mut latest_img: Option<Arc<dyn Layer>> = None;
let mut latest_img_lsn: Option<Lsn> = None;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
let img_lsn = l.get_lsn_range().start;
segentry.get(lsn)
}
if img_lsn > lsn {
// too new
continue;
}
if img_lsn == lsn {
// found exact match
return Ok(Some(SearchResult {
layer: Arc::clone(l),
lsn_floor: lsn,
}));
}
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
latest_img = Some(Arc::clone(l));
latest_img_lsn = Some(img_lsn);
}
}
///
/// Get the open layer for given segment for writing. Or None if no open
/// layer exists.
///
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
let segentry = self.segs.get(tag)?;
segentry
.open_layer_id
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
}
///
/// Insert an open in-memory layer
///
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
let layer_id = segentry.update_open(Arc::clone(&layer));
let oldest_lsn = layer.get_oldest_lsn();
// After a crash and restart, 'oldest_lsn' of the oldest in-memory
// layer becomes the WAL streaming starting point, so it better not point
// in the middle of a WAL record.
assert!(oldest_lsn.is_aligned());
// Also add it to the binary heap
let open_layer_entry = OpenLayerEntry {
oldest_lsn: layer.get_oldest_lsn(),
layer_id,
generation: self.current_generation,
};
self.open_layers.push(open_layer_entry);
NUM_INMEMORY_LAYERS.inc();
}
/// Remove an open in-memory layer
pub fn remove_open(&mut self, layer_id: LayerId) {
// Note: we don't try to remove the entry from the binary heap.
// It will be removed lazily by peek_oldest_open() when it's made it to
// the top of the heap.
let layer_opt = {
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
let layer_opt = global_map.get(&layer_id);
global_map.remove(&layer_id);
// TODO it's bad that a ref can still exist after being evicted from cache
layer_opt
};
if let Some(layer) = layer_opt {
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
if segentry.open_layer_id == Some(layer_id) {
// Also remove it from the SegEntry of this segment
segentry.open_layer_id = None;
} else {
// We could have already updated segentry.open for
// dropped (non-writeable) layer. This is fine.
assert!(!layer.is_writeable());
assert!(layer.is_dropped());
// Search the delta layers
let mut latest_delta: Option<Arc<dyn Layer>> = None;
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if !l.get_key_range().contains(&key) {
continue;
}
NUM_INMEMORY_LAYERS.dec();
if l.get_lsn_range().start > lsn {
// too new
continue;
}
if l.get_lsn_range().end > lsn {
// this layer contains the requested point in the key/lsn space.
// No need to search any further
info!(
"found layer {} for request on {} at {}",
l.filename().display(),
key,
lsn
);
latest_delta.replace(Arc::clone(l));
break;
}
// this layer's end LSN is smaller than the requested point. If there's
// nothing newer, this is what we need to return. Remember this.
if let Some(ref old_candidate) = latest_delta {
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
latest_delta.replace(Arc::clone(l));
}
} else {
latest_delta.replace(Arc::clone(l));
}
}
if let Some(l) = latest_delta {
info!(
"found (old) layer {} for request on {} at {}",
l.filename().display(),
key,
lsn
);
Ok(Some(SearchResult {
lsn_floor: latest_img_lsn.unwrap_or(l.get_lsn_range().start),
layer: l,
}))
} else if let Some(l) = latest_img {
info!("found img layer and no deltas for request on {} at {}", key, lsn);
Ok(Some(SearchResult {
lsn_floor: latest_img_lsn.unwrap(),
layer: l,
}))
} else {
info!("no layer found for request on {} at {}", key, lsn);
Ok(None)
}
}
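// Worked example (not from this commit, with keys written as plain integers):
// suppose the key is covered by an image layer at LSN 10 and a delta layer
// spanning LSNs 10..20. A search at LSN 15 returns the delta layer with
// lsn_floor = 10, telling the caller that the image at LSN 10 is the next
// place to look if the delta's records are not enough. A search at LSN 9
// finds neither layer and returns Ok(None).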
pub fn image_exists(&self, key_range: &Range<Key>, lsn: Lsn) -> bool {
for l in self.historic_layers.iter() {
if !l.is_incremental()
&& l.get_key_range() == *key_range
&& l.get_lsn_range().start == lsn
{
return true;
}
}
false
}
///
/// Insert an on-disk layer
///
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
segentry.insert_historic(layer);
self.historic_layers.push(layer);
NUM_ONDISK_LAYERS.inc();
}
@@ -147,61 +182,63 @@ impl LayerMap {
///
/// This should be called when the corresponding file on disk has been deleted.
///
#[allow(dead_code)]
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
let tag = layer.get_seg_tag();
let len_before = self.historic_layers.len();
if let Some(segentry) = self.segs.get_mut(&tag) {
segentry.historic.remove(&layer);
}
// FIXME: ptr_eq might fail to return true for 'dyn'
// references. Clippy complains about this. In practice it
// seems to work; otherwise the assertion below would be triggered.
// Still, this ought to be fixed.
#[allow(clippy::vtable_address_comparisons)]
self.historic_layers
.retain(|other| !Arc::ptr_eq(other, &layer));
assert_eq!(self.historic_layers.len(), len_before - 1);
NUM_ONDISK_LAYERS.dec();
}
// List relations along with a flag that marks if they exist at the given lsn.
// spcnode 0 and dbnode 0 have special meanings and mean all tablespaces/databases.
// Pass Tag if we're only interested in some relations.
pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
let mut rels: HashMap<RelishTag, bool> = HashMap::new();
for (seg, segentry) in self.segs.iter() {
match seg.rel {
RelishTag::Relation(reltag) => {
if let Some(request_rel) = tag {
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
{
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
}
}
_ => {
if tag == None {
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
rels.insert(seg.rel, exists);
}
}
}
}
}
Ok(rels)
}
/// Is there a newer image layer for the given key range?
///
/// This is used for garbage collection, to determine if an old layer can
/// be deleted.
/// We ignore layers newer than disk_consistent_lsn because they will be removed at restart.
/// We also only look at historic layers.
//#[allow(dead_code)]
pub fn newer_image_layer_exists(
&self,
seg: SegmentTag,
key_range: &Range<Key>,
lsn: Lsn,
disk_consistent_lsn: Lsn,
) -> bool {
if let Some(segentry) = self.segs.get(&seg) {
segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
} else {
false
) -> Result<bool> {
let mut range_remain = key_range.clone();
loop {
let mut made_progress = false;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
let img_lsn = l.get_lsn_range().start;
if !l.is_incremental() &&
l.get_key_range().contains(&range_remain.start) &&
img_lsn > lsn &&
img_lsn < disk_consistent_lsn
{
made_progress = true;
let img_key_end = l.get_key_range().end;
if img_key_end >= range_remain.end {
return Ok(true);
}
range_remain.start = img_key_end;
}
}
if !made_progress {
return Ok(false);
}
}
}
@@ -211,284 +248,139 @@ impl LayerMap {
/// used for garbage collection, to determine if some alive layer
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
/// to avoid incorrectly making it visible.
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.segs.get(&seg) {
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
} else {
false
})
/*
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
} else {
false
})
}
*/
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
self.historic_layers.iter()
}
/// Return the oldest in-memory layer, along with its generation number.
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
// Find the last image layer that covers the key
let mut candidate_lsn = Lsn(0);
let mut candidate = None;
for l in self.historic_layers.iter() {
if l.is_incremental() {
continue;
}
while let Some(oldest_entry) = self.open_layers.peek() {
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
} else {
self.open_layers.pop();
if !l.get_key_range().contains(&key) {
continue;
}
let this_lsn = l.get_lsn_range().start;
if this_lsn > lsn {
continue;
}
if this_lsn < candidate_lsn {
// our previous candidate was better
continue;
}
candidate_lsn = this_lsn;
candidate = Some(Arc::clone(l));
}
candidate
}
///
/// Divide the whole given range of keys into sub-ranges based on the latest
/// image layer that covers each range. (This is used when creating new
/// image layers)
///
// FIXME: clippy complains that the result type is very complex. She's probably
// right...
#[allow(clippy::type_complexity)]
pub fn image_coverage(
&self,
key_range: &Range<Key>,
lsn: Lsn,
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
let mut points: Vec<Key>;
points = vec![key_range.start];
for l in self.historic_layers.iter() {
if l.get_lsn_range().start > lsn {
continue;
}
let range = l.get_key_range();
if key_range.contains(&range.start) {
points.push(l.get_key_range().start);
}
if key_range.contains(&range.end) {
points.push(l.get_key_range().end);
}
}
None
}
points.push(key_range.end);
/// Increment the generation number used to stamp open in-memory layers. Layers
/// added with `insert_open` after this call will be associated with the new
/// generation. Returns the new generation number.
pub fn increment_generation(&mut self) -> u64 {
self.current_generation += 1;
self.current_generation
}
points.sort();
points.dedup();
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
HistoricLayerIter {
seg_iter: self.segs.iter(),
iter: None,
// Ok, we now have a list of "interesting" points in the key space
// For each range between the points, find the latest image
let mut start = *points.first().unwrap();
let mut ranges = Vec::new();
for end in points[1..].iter() {
let img = self.find_latest_image(start, lsn);
ranges.push((start..*end, img));
start = *end;
}
Ok(ranges)
}
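// Worked example (not from this commit, keys written as plain integers): with
// image layers covering keys 0..100 at LSN 10 and keys 50..150 at LSN 20,
// image_coverage(&(0..150), Lsn(25)) splits the range at the layer boundaries
// into 0..50, 50..100 and 100..150, pairing each sub-range with the newest
// covering image at or below LSN 25: the LSN-10 layer, the LSN-20 layer and
// the LSN-20 layer, respectively.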
pub fn get_deltas(
&self,
key_range: &Range<Key>,
lsn_range: &Range<Lsn>,
) -> Result<Vec<Arc<dyn Layer>>> {
let mut deltas = Vec::new();
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
continue;
}
if !range_overlaps(&l.get_key_range(), key_range) {
continue;
}
deltas.push(Arc::clone(l));
}
Ok(deltas)
}
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
let mut deltas = Vec::new();
for l in self.historic_layers.iter() {
if !l.is_incremental() {
continue;
}
if l.get_key_range() != (Key::MIN..Key::MAX) {
continue;
}
deltas.push(Arc::clone(l));
}
Ok(deltas)
}
/// debugging function to print out the contents of the layer map
#[allow(unused)]
pub fn dump(&self) -> Result<()> {
println!("Begin dump LayerMap");
for (seg, segentry) in self.segs.iter() {
if let Some(open) = &segentry.open_layer_id {
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
layer.dump()?;
} else {
println!("layer not found in global map");
}
}
for layer in segentry.historic.iter() {
layer.dump()?;
}
for layer in self.historic_layers.iter() {
layer.dump()?;
}
println!("End dump LayerMap");
Ok(())
}
}
impl IntervalItem for dyn Layer {
type Key = Lsn;
fn start_key(&self) -> Lsn {
self.get_start_lsn()
}
fn end_key(&self) -> Lsn {
self.get_end_lsn()
}
}
///
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
/// associated with the segment.
///
/// The last layer that is open for writes is always an InMemoryLayer,
/// and is kept in a separate field, because there can be only one for
/// each segment. The older layers, stored on disk, are kept in an
/// IntervalTree.
#[derive(Default)]
struct SegEntry {
open_layer_id: Option<LayerId>,
historic: IntervalTree<dyn Layer>,
}
impl SegEntry {
/// Does the segment exist at given LSN?
/// Return None if object is not found in this SegEntry.
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
if let Some(layer) = self.get(lsn) {
Ok(Some(layer.get_seg_exists(lsn)?))
} else {
Ok(None)
}
}
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
if let Some(open_layer_id) = &self.open_layer_id {
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
if open_layer.get_start_lsn() <= lsn {
return Some(open_layer);
}
}
self.historic.search(lsn)
}
pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
// We only check on-disk layers, because
// in-memory layers are not durable
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
// OK for a delta layer to have end LSN 101, but if the end LSN
// is 102, then it might not have been fully flushed to disk
// before crash.
self.historic
.iter_newer(lsn)
.any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
}
// Set new open layer for a SegEntry.
// It's ok to rewrite previous open layer,
// but only if it is not writeable anymore.
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
if let Some(prev_open_layer_id) = &self.open_layer_id {
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
{
assert!(!prev_open_layer.is_writeable());
}
}
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
self.open_layer_id = Some(open_layer_id);
open_layer_id
}
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
self.historic.insert(layer);
}
}
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
/// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
///
/// The generation number associated with each entry can be used to distinguish
/// recently-added entries (i.e after last call to increment_generation()) from older
/// entries with the same 'oldest_lsn'.
struct OpenLayerEntry {
oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
generation: u64,
layer_id: LayerId,
}
impl Ord for OpenLayerEntry {
fn cmp(&self, other: &Self) -> Ordering {
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that. Entries with identical oldest_lsn are ordered by generation
other
.oldest_lsn
.cmp(&self.oldest_lsn)
.then_with(|| other.generation.cmp(&self.generation))
}
}
impl PartialOrd for OpenLayerEntry {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OpenLayerEntry {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for OpenLayerEntry {}
/// Iterator returned by LayerMap::iter_historic_layers()
pub struct HistoricLayerIter<'a> {
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
iter: Option<IntervalIter<'a, dyn Layer>>,
}
impl<'a> Iterator for HistoricLayerIter<'a> {
type Item = Arc<dyn Layer>;
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
loop {
if let Some(x) = &mut self.iter {
if let Some(x) = x.next() {
return Some(Arc::clone(&x));
}
}
if let Some((_tag, segentry)) = self.seg_iter.next() {
self.iter = Some(segentry.historic.iter());
continue;
} else {
return None;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::config::PageServerConf;
use std::str::FromStr;
use zenith_utils::zid::{ZTenantId, ZTimelineId};
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
spcnode: 0,
dbnode: 111,
relnode: 1000,
forknum: 0,
});
lazy_static! {
static ref DUMMY_TIMELINEID: ZTimelineId =
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
static ref DUMMY_TENANTID: ZTenantId =
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
}
/// Construct a dummy InMemoryLayer for testing
fn dummy_inmem_layer(
conf: &'static PageServerConf,
segno: u32,
start_lsn: Lsn,
oldest_lsn: Lsn,
) -> Arc<InMemoryLayer> {
Arc::new(
InMemoryLayer::create(
conf,
*DUMMY_TIMELINEID,
*DUMMY_TENANTID,
SegmentTag {
rel: TESTREL_A,
segno,
},
start_lsn,
oldest_lsn,
)
.unwrap(),
)
}
#[test]
fn test_open_layers() -> Result<()> {
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
let conf = Box::leak(Box::new(conf));
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
let mut layers = LayerMap::default();
let gen1 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
let gen2 = layers.increment_generation();
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
// A helper function (closure) to pop the next oldest open entry from the layer map,
// and assert that it is what we'd expect
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
assert!(l.get_seg_tag().segno == expected_segno);
assert!(generation == expected_generation);
layers.remove_open(layer_id);
};
assert_pop_layer(0, gen1); // 0x100
assert_pop_layer(5, gen2); // 0x100
assert_pop_layer(3, gen1); // 0x110
assert_pop_layer(4, gen2); // 0x110
assert_pop_layer(2, gen1); // 0x120
assert_pop_layer(1, gen1); // 0x200
Ok(())
}
}

View File

@@ -2,75 +2,34 @@
//! Common traits and structs for layers
//!
use crate::relish::RelishTag;
use crate::repository::{BlockNumber, ZenithWalRecord};
use crate::repository::{Key, Value};
use crate::walrecord::ZenithWalRecord;
use crate::{ZTenantId, ZTimelineId};
use anyhow::Result;
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::collections::HashSet;
use std::ops::Range;
use std::path::PathBuf;
use zenith_utils::lsn::Lsn;
// Size of one segment in pages (10 MB)
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
// in # of key-value pairs
// FIXME Size of one segment in pages (128 MB)
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024;
pub const TARGET_FILE_SIZE: u32 = (TARGET_FILE_SIZE_BYTES / 8192) as u32;
///
/// Each relish stored in the repository is divided into fixed-sized "segments",
/// with 10 MB of key-space, or 1280 8k pages each.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct SegmentTag {
pub rel: RelishTag,
pub segno: u32,
}
/// SegmentBlk represents a block number within a segment, or the size of segment.
///
/// This is separate from BlockNumber, which is used for block number within the
/// whole relish. Since this is just a type alias, the compiler will let you mix
/// them freely, but we use the type alias as documentation to make it clear
/// which one we're dealing with.
///
/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally
/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes
/// operations more verbose).
pub type SegmentBlk = u32;
impl fmt::Display for SegmentTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}.{}", self.rel, self.segno)
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
where
T: PartialOrd<T>,
{
if a.start < b.start {
a.end > b.start
} else {
b.end > a.start
}
}
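// A few illustrative checks (not part of this commit): ranges are half-open,
// so two ranges that merely touch at an endpoint do not overlap.
#[test]
fn range_overlaps_examples() {
    assert!(range_overlaps(&(10..20), &(15..25))); // partial overlap
    assert!(range_overlaps(&(10..20), &(12..15))); // containment
    assert!(!range_overlaps(&(10..20), &(20..30))); // endpoints only touch
    assert!(!range_overlaps(&(0..5), &(10..20))); // disjoint
}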
impl SegmentTag {
/// Given a relish and block number, calculate the corresponding segment and
/// block number within the segment.
pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) {
(
SegmentTag {
rel,
segno: blknum / RELISH_SEG_SIZE,
},
blknum % RELISH_SEG_SIZE,
)
}
}
///
/// Represents a version of a page at a specific LSN. The LSN is the key of the
/// entry in the 'page_versions' hash, it is not duplicated here.
///
/// A page version can be stored as a full page image, or as WAL record that needs
/// to be applied over the previous page version to reconstruct this version.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum PageVersion {
Page(Bytes),
Wal(ZenithWalRecord),
}
///
/// FIXME
/// Struct used to communicate across calls to 'get_page_reconstruct_data'.
///
/// Before first call to get_page_reconstruct_data, you can fill in 'page_img'
@@ -88,25 +47,32 @@ pub enum PageVersion {
/// the same PageReconstructData struct in the next 'get_page_reconstruct_data'
/// call, to collect more records.
///
pub struct PageReconstructData {
#[derive(Debug)]
pub struct ValueReconstructState {
pub key: Key,
pub lsn: Lsn,
pub records: Vec<(Lsn, ZenithWalRecord)>,
pub page_img: Option<(Lsn, Bytes)>,
pub img: Option<(Lsn, Bytes)>,
pub request_lsn: Lsn, // original request's LSN, for debugging purposes
}
/// Return value from Layer::get_page_reconstruct_data
pub enum PageReconstructResult {
#[derive(Debug)]
pub enum ValueReconstructResult {
/// Got all the data needed to reconstruct the requested page
Complete,
/// This layer didn't contain all the required data, the caller should look up
/// the predecessor layer at the returned LSN and collect more data from there.
Continue(Lsn),
Continue,
/// This layer didn't contain data needed to reconstruct the page version at
/// the returned LSN. This is usually considered an error, but might be OK
/// in some circumstances.
Missing(Lsn),
Missing,
}
///
/// FIXME
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
/// layers are used to ingest incoming WAL, and provide fast access
@@ -120,21 +86,17 @@ pub trait Layer: Send + Sync {
/// Identify the timeline this relish belongs to
fn get_timeline_id(&self) -> ZTimelineId;
/// Identify the relish segment
fn get_seg_tag(&self) -> SegmentTag;
/// Range of segments that this layer covers
fn get_key_range(&self) -> Range<Key>;
/// FIXME
/// Inclusive start bound of the LSN range that this layer holds
fn get_start_lsn(&self) -> Lsn;
/// Exclusive end bound of the LSN range that this layer holds.
///
/// - For an open in-memory layer, this is MAX_LSN.
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
fn get_end_lsn(&self) -> Lsn;
/// Is the segment represented by this layer dropped by PostgreSQL?
fn is_dropped(&self) -> bool;
fn get_lsn_range(&self) -> Range<Lsn>;
/// Filename used to store this layer on disk. (Even in-memory layers
/// implement this, to print a handy unique identifier for the layer for
@@ -153,18 +115,11 @@ pub trait Layer: Send + Sync {
/// is available. If this returns PageReconstructResult::Continue, look up
/// the predecessor layer and call again with the same 'reconstruct_data' to
/// collect more data.
fn get_page_reconstruct_data(
fn get_value_reconstruct_data(
&self,
blknum: SegmentBlk,
lsn: Lsn,
reconstruct_data: &mut PageReconstructData,
) -> Result<PageReconstructResult>;
/// Return size of the segment at given LSN. (Only for blocky relations.)
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk>;
/// Does the segment exist at given LSN? Or was it dropped before it.
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
lsn_floor: Lsn,
reconstruct_data: &mut ValueReconstructState,
) -> Result<ValueReconstructResult>;
/// Does this layer only contain some data for the segment (incremental),
/// or does it contain a version of every page? This is important to know
@@ -175,6 +130,11 @@ pub trait Layer: Send + Sync {
/// Returns true for layers that are represented in memory.
fn is_in_memory(&self) -> bool;
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
/// Return a set of all distinct Keys present in this layer
fn collect_keys(&self, key_range: &Range<Key>, keys: &mut HashSet<Key>) -> Result<()>;
/// Release memory used by this layer. There is no corresponding 'load'
/// function, that's done implicitly when you call one of the get-functions.
fn unload(&self) -> Result<()>;

View File

@@ -0,0 +1,48 @@
// Utilities for reading and writing Values
use std::io::{Error, Write};
use std::os::unix::fs::FileExt;
use bookfile::BoundedReader;
pub fn read_blob<F: FileExt>(file: &F, off: u64) -> Result<Vec<u8>, Error> {
// read length
let mut len_buf = [0u8; 4];
file.read_exact_at(&mut len_buf, off)?;
let len = u32::from_ne_bytes(len_buf);
let mut buf: Vec<u8> = Vec::new();
buf.resize(len as usize, 0);
file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?;
Ok(buf)
}
pub fn read_blob_from_chapter<F: FileExt>(
file: &BoundedReader<&F>,
off: u64,
) -> Result<Vec<u8>, Error> {
// read length
let mut len_buf = [0u8; 4];
file.read_exact_at(&mut len_buf, off)?;
let len = u32::from_ne_bytes(len_buf);
let mut buf: Vec<u8> = Vec::new();
buf.resize(len as usize, 0);
file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?;
Ok(buf)
}
pub fn write_blob<W: Write>(writer: &mut W, buf: &[u8]) -> Result<u64, Error> {
let val_len = buf.len() as u32;
// write the 'length' field, followed by the payload itself.
let lenbuf = u32::to_ne_bytes(val_len);
writer.write_all(&lenbuf)?;
writer.write_all(buf)?;
Ok(4 + val_len as u64)
}
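// A minimal round-trip sketch (not part of this commit): each blob is stored
// as a 4-byte native-endian length followed by the payload, so an offset
// returned by write_blob can later be handed back to read_blob. Assumes the
// helpers above are in scope.
#[test]
fn blob_roundtrip() -> Result<(), Error> {
    use std::fs::File;

    let path = std::env::temp_dir().join("blob_roundtrip_demo");
    let mut offsets = Vec::new();
    {
        let mut file = File::create(&path)?;
        let mut off = 0u64;
        for payload in [&b"hello"[..], &b"world!"[..]] {
            offsets.push(off);
            off += write_blob(&mut file, payload)?; // 4 bytes of length + payload
        }
    }
    let file = File::open(&path)?;
    assert_eq!(read_blob(&file, offsets[0])?, b"hello".to_vec());
    assert_eq!(read_blob(&file, offsets[1])?, b"world!".to_vec());
    std::fs::remove_file(&path)?;
    Ok(())
}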

View File

@@ -6,6 +6,7 @@ pub mod import_datadir;
pub mod layered_repository;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod relish;
pub mod remote_storage;
pub mod repository;
@@ -22,7 +23,8 @@ use lazy_static::lazy_static;
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
use zenith_utils::zid::{ZTenantId, ZTimelineId};
use layered_repository::{LayeredRepository, LayeredTimeline};
use layered_repository::LayeredRepository;
use pgdatadir_mapping::DatadirTimeline;
lazy_static! {
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
@@ -47,5 +49,5 @@ pub enum CheckpointConfig {
}
pub type RepositoryImpl = LayeredRepository;
pub type TimelineImpl = LayeredTimeline;
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;

View File

@@ -32,8 +32,10 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
use crate::basebackup;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::relish::*;
use crate::repository::{Repository, Timeline};
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::tenant_mgr;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
@@ -395,8 +397,8 @@ impl PageServerHandler {
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
fn wait_or_get_last_lsn<T: Timeline>(
timeline: &T,
fn wait_or_get_last_lsn<R: Repository>(
timeline: &DatadirTimeline<R>,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
@@ -423,7 +425,7 @@ impl PageServerHandler {
if lsn <= last_record_lsn {
lsn = last_record_lsn;
} else {
timeline.wait_lsn(lsn)?;
timeline.tline.wait_lsn(lsn)?;
// Since we waited for 'lsn' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the
// last-record LSN can advance immediately after we return
@@ -433,7 +435,7 @@ impl PageServerHandler {
if lsn == Lsn(0) {
bail!("invalid LSN(0) in request");
}
timeline.wait_lsn(lsn)?;
timeline.tline.wait_lsn(lsn)?;
}
ensure!(
lsn >= **latest_gc_cutoff_lsn,
@@ -443,54 +445,47 @@ impl PageServerHandler {
Ok(lsn)
}
fn handle_get_rel_exists_request<T: Timeline>(
fn handle_get_rel_exists_request<R: Repository>(
&self,
timeline: &T,
timeline: &DatadirTimeline<R>,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let exists = timeline.get_rel_exists(tag, lsn)?;
let exists = timeline.get_rel_exists(req.rel, lsn)?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
exists,
}))
}
fn handle_get_nblocks_request<T: Timeline>(
fn handle_get_nblocks_request<R: Repository>(
&self,
timeline: &T,
timeline: &DatadirTimeline<R>,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
let n_blocks = timeline.get_relish_size(tag, lsn)?;
// Return 0 if relation is not found.
// This is what postgres smgr expects.
let n_blocks = n_blocks.unwrap_or(0);
let n_blocks = timeline.get_rel_size(req.rel, lsn)?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
}))
}
fn handle_get_page_at_lsn_request<T: Timeline>(
fn handle_get_page_at_lsn_request<R: Repository>(
&self,
timeline: &T,
timeline: &DatadirTimeline<R>,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
.entered();
let tag = RelishTag::Relation(req.rel);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
/*
// Add a 1s delay to some requests. The delayed causes the requests to
@@ -500,7 +495,7 @@ impl PageServerHandler {
std::thread::sleep(std::time::Duration::from_millis(1000));
}
*/
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -520,7 +515,7 @@ impl PageServerHandler {
// check that the timeline exists
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Cannot handle basebackup request for a remote timeline")?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
timeline
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
@@ -699,67 +694,19 @@ impl postgres_backend::Handler for PageServerHandler {
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"layer_relfiles_total"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_relfiles_removed"),
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
RowDescriptor::int8_col(b"layers_total"),
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
RowDescriptor::int8_col(b"layers_needed_by_branches"),
RowDescriptor::int8_col(b"layers_not_updated"),
RowDescriptor::int8_col(b"layers_removed"),
RowDescriptor::int8_col(b"elapsed"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_relfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_relfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_by_cutoff
.to_string()
.as_bytes(),
),
Some(
result
.ondisk_nonrelfiles_needed_by_branches
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
Some(
result
.ondisk_nonrelfiles_needed_as_tombstone
.to_string()
.as_bytes(),
),
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
Some(result.layers_total.to_string().as_bytes()),
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
Some(result.layers_needed_by_branches.to_string().as_bytes()),
Some(result.layers_not_updated.to_string().as_bytes()),
Some(result.layers_removed.to_string().as_bytes()),
Some(result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -779,7 +726,7 @@ impl postgres_backend::Handler for PageServerHandler {
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)
.context("Failed to fetch local timeline for checkpoint request")?;
timeline.checkpoint(CheckpointConfig::Forced)?;
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {

File diff suppressed because it is too large

View File

@@ -641,7 +641,7 @@ mod fs_tests {
}
async fn upload_dummy_file(
harness: &RepoHarness,
harness: &RepoHarness<'_>,
storage: &LocalFs,
name: &str,
) -> anyhow::Result<PathBuf> {

View File

@@ -881,7 +881,7 @@ mod test_utils {
#[track_caller]
pub async fn ensure_correct_timeline_upload(
harness: &RepoHarness,
harness: &RepoHarness<'_>,
remote_assets: Arc<(LocalFs, RwLock<RemoteTimelineIndex>)>,
timeline_id: ZTimelineId,
new_upload: NewCheckpoint,

File diff suppressed because it is too large

View File

@@ -2,14 +2,15 @@
//! page server.
use crate::branches;
use crate::{RepositoryImpl, TimelineImpl};
use crate::config::PageServerConf;
use crate::layered_repository::LayeredRepository;
use crate::repository::{Repository, TimelineSyncState};
use crate::repository::Repository;
use crate::repository::TimelineSyncState;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
use crate::walredo::PostgresRedoManager;
use crate::CheckpointConfig;
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::{bail, Context, Result};
use lazy_static::lazy_static;
use log::*;
@@ -26,6 +27,8 @@ lazy_static! {
struct Tenant {
state: TenantState,
repo: Arc<RepositoryImpl>,
timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -79,15 +82,17 @@ pub fn set_timeline_states(
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo: Arc<RepositoryImpl> = Arc::new(LayeredRepository::new(
let repo = LayeredRepository::new(
conf,
Arc::new(walredo_mgr),
tenant_id,
conf.remote_storage_config.is_some(),
));
);
Tenant {
state: TenantState::Idle,
repo,
repo: Arc::new(repo),
timelines: HashMap::new(),
}
});
if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
@@ -191,6 +196,7 @@ pub fn create_repository_for_tenant(
v.insert(Tenant {
state: TenantState::Idle,
repo,
timelines: HashMap::new(),
});
}
}
@@ -261,11 +267,25 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryIm
pub fn get_timeline_for_tenant(
tenantid: ZTenantId,
timelineid: ZTimelineId,
) -> Result<Arc<TimelineImpl>> {
get_repository_for_tenant(tenantid)?
) -> Result<Arc<DatadirTimelineImpl>> {
let mut m = access_tenants();
let tenant = m
.get_mut(&tenantid)
.with_context(|| format!("Tenant not found for tenant {}", tenantid))?;
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
return Ok(Arc::clone(page_tline));
}
// First access to this timeline. Create a DatadirTimeline wrapper for it
let tline = tenant
.repo
.get_timeline(timelineid)?
.local_timeline()
.with_context(|| format!("cannot fetch timeline {}", timelineid))
.with_context(|| format!("cannot fetch timeline {}", timelineid))?;
let page_tline = Arc::new(DatadirTimelineImpl::new(tline));
tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
Ok(page_tline)
}
#[derive(Serialize, Deserialize, Clone)]

View File

@@ -14,6 +14,15 @@ use zenith_utils::zid::ZTenantId;
/// Checkpointer thread's main loop
///
pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
if let Err(err) = checkpoint_loop_ext(tenantid, conf) {
error!("checkpoint loop terminated with error: {:?}", err);
Err(err)
} else {
Ok(())
}
}
fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
loop {
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
break;

File diff suppressed because it is too large

View File

@@ -7,7 +7,6 @@
use crate::config::PageServerConf;
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::tenant_mgr;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
@@ -255,8 +254,7 @@ fn walreceiver_main(
// at risk of hitting a deadlock.
assert!(lsn.is_aligned());
let writer = timeline.writer();
walingest.ingest_record(&*timeline, writer.as_ref(), recdata, lsn)?;
walingest.ingest_record(&timeline, recdata, lsn)?;
fail_point!("walreceiver-after-ingest");

View File

@@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti
use serde::{Deserialize, Serialize};
use tracing::*;
use crate::repository::ZenithWalRecord;
/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum ZenithWalRecord {
/// Native PostgreSQL WAL record
Postgres { will_init: bool, rec: Bytes },
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
ClearVisibilityMapFlags {
new_heap_blkno: Option<u32>,
old_heap_blkno: Option<u32>,
flags: u8,
},
/// Mark transaction IDs as committed on a CLOG page
ClogSetCommitted { xids: Vec<TransactionId> },
/// Mark transaction IDs as aborted on a CLOG page
ClogSetAborted { xids: Vec<TransactionId> },
/// Extend multixact offsets SLRU
MultixactOffsetCreate {
mid: MultiXactId,
moff: MultiXactOffset,
},
/// Extend multixact members SLRU.
MultixactMembersCreate {
moff: MultiXactOffset,
members: Vec<MultiXactMember>,
},
}
impl ZenithWalRecord {
/// Does replaying this WAL record initialize the page from scratch, or does
/// it need to be applied over the previous image of the page?
pub fn will_init(&self) -> bool {
match self {
ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init,
// None of the special zenith record types currently initialize the page
_ => false,
}
}
}
/// DecodedBkpBlock represents per-page data contained in a WAL record.
#[derive(Default)]
@@ -87,6 +127,28 @@ impl XlRelmapUpdate {
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrCreate {
pub rnode: RelFileNode,
// FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
// well-defined size?
pub forknum: u8,
}
impl XlSmgrCreate {
pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
XlSmgrCreate {
rnode: RelFileNode {
spcnode: buf.get_u32_le(), /* tablespace */
dbnode: buf.get_u32_le(), /* database */
relnode: buf.get_u32_le(), /* relation */
},
forknum: buf.get_u32_le() as u8,
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrTruncate {

View File

@@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock;
use zenith_utils::zid::ZTenantId;
use crate::config::PageServerConf;
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::relish::*;
use crate::repository::ZenithWalRecord;
use crate::repository::Key;
use crate::walrecord::ZenithWalRecord;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
@@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync {
/// the records.
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
@@ -92,8 +93,7 @@ pub struct DummyRedoManager {}
impl crate::walredo::WalRedoManager for DummyRedoManager {
fn request_redo(
&self,
_rel: RelishTag,
_blknum: u32,
_key: Key,
_lsn: Lsn,
_base_img: Option<Bytes>,
_records: Vec<(Lsn, ZenithWalRecord)>,
@@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool {
}
}
fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool {
if let RelishTag::Relation(RelTag {
forknum,
spcnode: _,
dbnode: _,
relnode: _,
}) = rel
{
*forknum == expected_forknum
} else {
false
}
}
fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool {
if let RelishTag::Slru { slru, segno } = rel {
*slru == expected_slru && *segno == expected_segno
} else {
false
}
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
@@ -184,6 +162,8 @@ pub enum WalRedoError {
InvalidState,
#[error("cannot perform WAL redo for this request")]
InvalidRequest,
#[error("cannot perform WAL redo for this record")]
InvalidRecord,
}
///
@@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
///
fn request_redo(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
@@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager {
if rec_zenith != batch_zenith {
let result = if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
self.apply_batch_zenith(key, lsn, img, &records[batch_start..i])
} else {
self.apply_batch_postgres(
rel,
blknum,
key,
lsn,
img,
&records[batch_start..i],
@@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager {
}
// last batch
if batch_zenith {
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
self.apply_batch_zenith(key, lsn, img, &records[batch_start..])
} else {
self.apply_batch_postgres(
rel,
blknum,
key,
lsn,
img,
&records[batch_start..],
@@ -268,16 +245,15 @@ impl PostgresRedoManager {
///
fn apply_batch_postgres(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
wal_redo_timeout: Duration,
) -> Result<Bytes, WalRedoError> {
let start_time = Instant::now();
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
let apply_result: Result<Bytes, Error>;
let start_time = Instant::now();
let mut process_guard = self.process.lock().unwrap();
let lock_time = Instant::now();
@@ -291,16 +267,11 @@ impl PostgresRedoManager {
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
let result = if let RelishTag::Relation(rel) = rel {
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);
apply_result.map_err(WalRedoError::IoError)
} else {
error!("unexpected non-relation relish: {:?}", rel);
Err(WalRedoError::InvalidRequest)
};
// Relational WAL records are applied using wal-redo-postgres
let buf_tag = BufferTag { rel, blknum };
let result = process
.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout)
.map_err(WalRedoError::IoError);
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
@@ -326,8 +297,7 @@ impl PostgresRedoManager {
///
fn apply_batch_zenith(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: &[(Lsn, ZenithWalRecord)],
@@ -346,7 +316,7 @@ impl PostgresRedoManager {
// Apply all the WAL records in the batch
for (record_lsn, record) in records.iter() {
self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?;
self.apply_record_zenith(key, &mut page, *record_lsn, record)?;
}
// Success!
let end_time = Instant::now();
@@ -365,8 +335,7 @@ impl PostgresRedoManager {
fn apply_record_zenith(
&self,
rel: RelishTag,
blknum: u32,
key: Key,
page: &mut BytesMut,
_record_lsn: Lsn,
record: &ZenithWalRecord,
@@ -382,9 +351,10 @@ impl PostgresRedoManager {
flags,
} => {
// sanity check that this is modifying the correct relish
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert!(
check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
"ClearVisibilityMapFlags record on unexpected rel {:?}",
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
"ClearVisibilityMapFlags record on unexpected rel {}",
rel
);
if let Some(heap_blkno) = *new_heap_blkno {
@@ -418,6 +388,14 @@ impl PostgresRedoManager {
// Non-relational WAL records are handled here, with custom code that has the
// same effects as the corresponding Postgres WAL redo function.
ZenithWalRecord::ClogSetCommitted { xids } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetCommitted record with unexpected key {}",
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -425,12 +403,17 @@ impl PostgresRedoManager {
// Check that we're modifying the correct CLOG block.
assert!(
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
segno == expected_segno,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
rel
key
);
assert!(
blknum == expected_blknum,
"ClogSetCommitted record for XID {} with unexpected key {}",
xid,
key
);
assert!(blknum == expected_blknum);
transaction_id_set_status(
xid,
@@ -440,6 +423,14 @@ impl PostgresRedoManager {
}
}
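// Worked example (not from this commit), assuming PostgreSQL's standard
// constants CLOG_XACTS_PER_PAGE = 8192 * 4 = 32768 and
// SLRU_PAGES_PER_SEGMENT = 32: for xid = 1_000_000, pageno = 30, so the
// record's key must decode to CLOG segment 30 / 32 = 0 and block
// 30 % 32 = 30; anything else trips the assertions above.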
ZenithWalRecord::ClogSetAborted { xids } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::Clog,
"ClogSetAborted record with unexpected key {}",
key
);
for &xid in xids {
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -447,17 +438,30 @@ impl PostgresRedoManager {
// Check that we're modifying the correct CLOG block.
assert!(
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
segno == expected_segno,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
rel
key
);
assert!(
blknum == expected_blknum,
"ClogSetAborted record for XID {} with unexpected key {}",
xid,
key
);
assert!(blknum == expected_blknum);
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
}
}
ZenithWalRecord::MultixactOffsetCreate { mid, moff } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::MultiXactOffsets,
"MultixactOffsetCreate record with unexpected key {}",
key
);
// Compute the block and offset to modify.
// See RecordNewMultiXact in PostgreSQL sources.
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
@@ -468,16 +472,29 @@ impl PostgresRedoManager {
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno),
"MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}",
segno == expected_segno,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
rel
key
);
assert!(
blknum == expected_blknum,
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
mid,
key
);
assert!(blknum == expected_blknum);
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
}
ZenithWalRecord::MultixactMembersCreate { moff, members } => {
let (slru_kind, segno, blknum) =
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
assert_eq!(
slru_kind,
SlruKind::MultiXactMembers,
"MultixactMembersCreate record with unexpected key {}",
key
);
for (i, member) in members.iter().enumerate() {
let offset = moff + i as u32;
@@ -492,12 +509,17 @@ impl PostgresRedoManager {
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
assert!(
check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno),
"MultiXactMembersCreate record at offset {} with unexpected rel {:?}",
segno == expected_segno,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
rel
key
);
assert!(
blknum == expected_blknum,
"MultiXactMembersCreate record for offset {} with unexpected key {}",
moff,
key
);
assert!(blknum == expected_blknum);
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);

View File

@@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2;
pub const INIT_FORKNUM: u8 = 3;
// From storage_xlog.h
pub const XLOG_SMGR_CREATE: u8 = 0x10;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
@@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
// From pg_control.h and rmgrlist.h
pub const XLOG_NEXTOID: u8 = 0x30;
pub const XLOG_SWITCH: u8 = 0x40;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
pub const XLOG_FPI: u8 = 0xB0;
pub const DB_SHUTDOWNED: u32 = 1;

View File

@@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int:
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
.format_map(row))
log.info(
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
.format_map(row))