mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-17 10:22:56 +00:00
Compare commits
20 Commits
persistent
...
layered-re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
534fd33e3d | ||
|
|
6b2e7d9499 | ||
|
|
ef5f3eb514 | ||
|
|
be1386a555 | ||
|
|
aad4d1da85 | ||
|
|
fbff37a64c | ||
|
|
2b080b49c4 | ||
|
|
dd63b81539 | ||
|
|
f8e533bbdf | ||
|
|
8d0086f749 | ||
|
|
d285898c73 | ||
|
|
61761bf1ce | ||
|
|
0b2ed17f86 | ||
|
|
df8e3e1695 | ||
|
|
99f3775d68 | ||
|
|
8f81ac064e | ||
|
|
3b84975ca9 | ||
|
|
df3e403967 | ||
|
|
4f7b22a8a8 | ||
|
|
3a3e48059c |
@@ -7,7 +7,7 @@ executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.51.0
|
||||
- image: cimg/rust:1.52.0
|
||||
|
||||
jobs:
|
||||
|
||||
|
||||
99
Cargo.lock
generated
99
Cargo.lock
generated
@@ -1,7 +1,5 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.4.7"
|
||||
@@ -82,6 +80,30 @@ version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
|
||||
|
||||
[[package]]
|
||||
name = "aversion"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "25b49482974b90e9f36c5adcc50acde2e27e806ac269ff32758d700432782bc0"
|
||||
dependencies = [
|
||||
"aversion-macros",
|
||||
"byteorder",
|
||||
"serde",
|
||||
"serde_cbor",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aversion-macros"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed3009cf133dbd82459e96cf46bb24c8e6ad5c02c387ddb21d0f2c4c781a5394"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-creds"
|
||||
version = "0.26.0"
|
||||
@@ -160,6 +182,18 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bookfile"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7db391acd99b8bdce5d5a66ca28530761affec9a407df91aee668fc318e3db71"
|
||||
dependencies = [
|
||||
"aversion",
|
||||
"byteorder",
|
||||
"serde",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "boxfnonce"
|
||||
version = "0.1.1"
|
||||
@@ -640,6 +674,12 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.9.1"
|
||||
@@ -1108,6 +1148,7 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bookfile",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
@@ -1232,24 +1273,6 @@ dependencies = [
|
||||
"tokio-postgres 0.7.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"hmac",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.1"
|
||||
@@ -1269,14 +1292,21 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.1"
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
|
||||
checksum = "ff3e0f70d32e20923cabf2df02913be7c1842d4c772db8065c00fcfdd1d1bff3"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"hmac",
|
||||
"md-5",
|
||||
"memchr",
|
||||
"rand",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1289,6 +1319,17 @@ dependencies = [
|
||||
"postgres-protocol 0.6.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "430f4131e1b7657b0cd9a2b0c3408d77c9a43a042d300b8c77f981dffcc43a2f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
@@ -1654,6 +1695,16 @@ dependencies = [
|
||||
"xml-rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_cbor"
|
||||
version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622"
|
||||
dependencies = [
|
||||
"half",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.126"
|
||||
|
||||
@@ -7,6 +7,7 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
bookfile = "^0.2"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::object_key::{DatabaseTag, ObjectTag};
|
||||
use crate::relish::*;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
@@ -32,9 +32,6 @@ pub struct Basebackup<'a> {
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
slru_buf: [u8; pg_constants::SLRU_SEG_SIZE],
|
||||
slru_segno: u32,
|
||||
slru_path: &'static str,
|
||||
}
|
||||
|
||||
impl<'a> Basebackup<'a> {
|
||||
@@ -49,9 +46,6 @@ impl<'a> Basebackup<'a> {
|
||||
timeline,
|
||||
lsn,
|
||||
prev_record_lsn,
|
||||
slru_path: "",
|
||||
slru_segno: u32::MAX,
|
||||
slru_buf: [0u8; pg_constants::SLRU_SEG_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -82,21 +76,19 @@ impl<'a> Basebackup<'a> {
|
||||
// It allows to easily construct SLRU segments.
|
||||
for obj in self.timeline.list_nonrels(self.lsn)? {
|
||||
match obj {
|
||||
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
|
||||
ObjectTag::MultiXactMembers(slru) => {
|
||||
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
self.add_slru_segment(slru, segno)?;
|
||||
}
|
||||
ObjectTag::MultiXactOffsets(slru) => {
|
||||
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
self.add_relmap_file(spcnode, dbnode)?;
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
|
||||
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// write last non-completed SLRU segment (if any)
|
||||
self.finish_slru_segment()?;
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
@@ -107,45 +99,33 @@ impl<'a> Basebackup<'a> {
|
||||
//
|
||||
// Generate SLRU segment files from repository. Path identifies SLRU kind (pg_xact, pg_multixact/members, ...).
|
||||
//
|
||||
fn add_slru_segment(
|
||||
&mut self,
|
||||
path: &'static str,
|
||||
tag: &ObjectTag,
|
||||
blknum: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
// Zero length image indicates truncated segment: just skip it
|
||||
if !img.is_empty() {
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
let segno = blknum / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if self.slru_path != "" && (self.slru_segno != segno || self.slru_path != path) {
|
||||
// Switch to new segment: save old one
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
self.slru_buf = [0u8; pg_constants::SLRU_SEG_SIZE]; // reinitialize segment buffer
|
||||
}
|
||||
self.slru_segno = segno;
|
||||
self.slru_path = path;
|
||||
let offs_start = (blknum % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
|
||||
* pg_constants::BLCKSZ as usize;
|
||||
let offs_end = offs_start + pg_constants::BLCKSZ as usize;
|
||||
self.slru_buf[offs_start..offs_end].copy_from_slice(&img);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(RelishTag::Slru { slru, segno }, self.lsn)?;
|
||||
|
||||
//
|
||||
// We flush SLRU segments to the tarball once them are completed.
|
||||
// This method is used to flush last (may be incompleted) segment.
|
||||
//
|
||||
fn finish_slru_segment(&mut self) -> anyhow::Result<()> {
|
||||
if self.slru_path != "" {
|
||||
// is there is some incompleted segment
|
||||
let segname = format!("{}/{:>04X}", self.slru_path, self.slru_segno);
|
||||
let header = new_tar_header(&segname, pg_constants::SLRU_SEG_SIZE as u64)?;
|
||||
self.ar.append(&header, &self.slru_buf[..])?;
|
||||
let mut slru_buf: Vec<u8> =
|
||||
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = self.timeline.get_page_at_lsn_nowait(
|
||||
RelishTag::Slru { slru, segno },
|
||||
blknum,
|
||||
self.lsn,
|
||||
)?;
|
||||
assert!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
slru_buf.extend_from_slice(&img);
|
||||
}
|
||||
|
||||
let dir = match slru {
|
||||
SlruKind::Clog => "pg_xact",
|
||||
SlruKind::MultiXactMembers => "pg_multixact/members",
|
||||
SlruKind::MultiXactOffsets => "pg_multixact/offsets",
|
||||
};
|
||||
|
||||
let segname = format!("{}/{:>04X}", dir, segno);
|
||||
let header = new_tar_header(&segname, slru_buf.len() as u64)?;
|
||||
self.ar.append(&header, slru_buf.as_slice())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -153,10 +133,13 @@ impl<'a> Basebackup<'a> {
|
||||
// Extract pg_filenode.map files from repository
|
||||
// Along with them also send PG_VERSION for each database.
|
||||
//
|
||||
fn add_relmap_file(&mut self, tag: &ObjectTag, db: &DatabaseTag) -> anyhow::Result<()> {
|
||||
trace!("add_relmap_file {:?}", db);
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn_nowait(
|
||||
RelishTag::FileNodeMap { spcnode, dbnode },
|
||||
0,
|
||||
self.lsn,
|
||||
)?;
|
||||
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let dst_path = "PG_VERSION";
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
@@ -169,19 +152,19 @@ impl<'a> Basebackup<'a> {
|
||||
String::from("global/pg_filenode.map") // filenode map for global tablespace
|
||||
} else {
|
||||
// User defined tablespaces are not supported
|
||||
assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
|
||||
// Append dir path for each database
|
||||
let path = format!("base/{}", db.dbnode);
|
||||
let path = format!("base/{}", dbnode);
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
|
||||
let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &version_bytes[..])?;
|
||||
|
||||
format!("base/{}/pg_filenode.map", db.dbnode)
|
||||
format!("base/{}/pg_filenode.map", dbnode)
|
||||
};
|
||||
assert!(img.len() == 512);
|
||||
let header = new_tar_header(&path, img.len() as u64)?;
|
||||
@@ -192,12 +175,14 @@ impl<'a> Basebackup<'a> {
|
||||
//
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, tag: &ObjectTag, xid: TransactionId) -> anyhow::Result<()> {
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
// Include in tarball two-phase files only of in-progress transactions
|
||||
if self.timeline.get_tx_status(xid, self.lsn)?
|
||||
== pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
{
|
||||
let img = self.timeline.get_page_at_lsn_nowait(*tag, self.lsn)?;
|
||||
let img =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
let crc = crc32c::crc32c(&img[..]);
|
||||
@@ -214,12 +199,12 @@ impl<'a> Basebackup<'a> {
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::Checkpoint, self.lsn)?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn_nowait(ObjectTag::ControlFile, self.lsn)?;
|
||||
let checkpoint_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn_nowait(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
@@ -262,6 +247,7 @@ impl<'a> Basebackup<'a> {
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(&pg_control);
|
||||
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -17,7 +17,8 @@ use anyhow::Result;
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{branches, logger, page_cache, page_service, PageServerConf};
|
||||
use pageserver::{branches, logger, page_cache, page_service};
|
||||
use pageserver::{PageServerConf, RepositoryFormat};
|
||||
|
||||
const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:64000";
|
||||
|
||||
@@ -33,6 +34,7 @@ struct CfgFileParams {
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
repository_format: Option<String>,
|
||||
}
|
||||
|
||||
impl CfgFileParams {
|
||||
@@ -47,6 +49,7 @@ impl CfgFileParams {
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
repository_format: get_arg("repository-format"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,6 +61,7 @@ impl CfgFileParams {
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
repository_format: self.repository_format.or(other.repository_format),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,6 +90,21 @@ impl CfgFileParams {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
//
|
||||
// FIXME: This pageserver --repository-format option is pretty useless as it
|
||||
// isn't exposed as an option to "zenith init". But you can change the default
|
||||
// here if you want to test the rocksdb implementation:
|
||||
//
|
||||
let repository_format = match self.repository_format.as_ref() {
|
||||
Some(repo_format_str) if repo_format_str == "rocksdb" => RepositoryFormat::RocksDb,
|
||||
Some(repo_format_str) if repo_format_str == "layered" => RepositoryFormat::Layered,
|
||||
Some(repo_format_str) => anyhow::bail!(
|
||||
"invalid --repository-format '{}', must be 'rocksdb' or 'layered'",
|
||||
repo_format_str
|
||||
),
|
||||
None => RepositoryFormat::Layered, // default
|
||||
};
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
|
||||
@@ -98,6 +117,7 @@ impl CfgFileParams {
|
||||
workdir: PathBuf::from("."),
|
||||
|
||||
pg_distrib_dir,
|
||||
repository_format,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -157,6 +177,12 @@ fn main() -> Result<()> {
|
||||
.help("Create tenant during init")
|
||||
.requires("init"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("repository-format")
|
||||
.long("repository-format")
|
||||
.takes_value(true)
|
||||
.help("Which repository implementation to use, 'rocksdb' or 'layered'"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
|
||||
@@ -25,7 +25,7 @@ use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::ZTenantId;
|
||||
use crate::{repository::Repository, PageServerConf, ZTimelineId};
|
||||
use crate::{repository::Repository, PageServerConf, RepositoryFormat, ZTimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
@@ -73,8 +73,8 @@ pub fn init_pageserver(
|
||||
pub fn create_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
wal_redo_manager: Arc<dyn WalRedoManager>,
|
||||
) -> Result<ObjectRepository> {
|
||||
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
) -> Result<Arc<dyn Repository>> {
|
||||
let repo_dir = conf.tenant_path(&tenantid);
|
||||
if repo_dir.exists() {
|
||||
bail!("repo for {} already exists", tenantid)
|
||||
@@ -104,19 +104,28 @@ pub fn create_repo(
|
||||
// and we failed to run initdb again in the same directory. This has been solved for the
|
||||
// rapid init+start case now, but the general race condition remains if you restart the
|
||||
// server quickly.
|
||||
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
|
||||
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
|
||||
RepositoryFormat::Layered => Arc::new(
|
||||
crate::layered_repository::LayeredRepository::new(conf,
|
||||
wal_redo_manager,
|
||||
tenantid
|
||||
)),
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = crate::rocksdb_storage::RocksObjectStore::create(conf, &tenantid)?;
|
||||
|
||||
let repo = crate::object_repository::ObjectRepository::new(
|
||||
conf,
|
||||
std::sync::Arc::new(storage),
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
);
|
||||
Arc::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
wal_redo_manager,
|
||||
tenantid
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, tli, &repo)?;
|
||||
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -181,6 +190,7 @@ fn bootstrap_timeline(
|
||||
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
timeline.checkpoint()?;
|
||||
|
||||
let wal_dir = pgdata_path.join("pg_wal");
|
||||
restore_local_repo::import_timeline_wal(&wal_dir, &*timeline, timeline.get_last_record_lsn())?;
|
||||
|
||||
1124
pageserver/src/layered_repository.rs
Normal file
1124
pageserver/src/layered_repository.rs
Normal file
File diff suppressed because it is too large
Load Diff
274
pageserver/src/layered_repository/README.md
Normal file
274
pageserver/src/layered_repository/README.md
Normal file
@@ -0,0 +1,274 @@
|
||||
# Overview
|
||||
|
||||
The on-disk format is based on immutable files. The page server
|
||||
receives a stream of incoming WAL, parses the WAL records to determine
|
||||
which pages they apply to, and accumulates the incoming changes in
|
||||
memory. Every now and then, the accumulated changes are written out to
|
||||
new files.
|
||||
|
||||
The files are called "snapshot files". Each snapshot file corresponds
|
||||
to one PostgreSQL relation fork. The snapshot files for each timeline
|
||||
are stored in the timeline's subdirectory under .zenith/timelines.
|
||||
|
||||
The files are named like this:
|
||||
|
||||
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||
|
||||
For example:
|
||||
|
||||
rel_1663_13990_2609_0_000000000169C348_0000000001702000
|
||||
|
||||
Each snapshot file contains a full snapshot, that is, a full copy of all
|
||||
pages in the relation, as of the "start LSN". It also contains all WAL
|
||||
records applicable to the relation between the start and end
|
||||
LSNs. With this information, the page server can reconstruct any page
|
||||
version of the relation in the LSN range.
|
||||
|
||||
If a file has been dropped, the last snapshot file for it is created
|
||||
with the _DROPPED suffix, e.g.
|
||||
|
||||
rel_1663_13990_2609_0_000000000169C348_0000000001702000_DROPPED
|
||||
|
||||
In addition to the relations, with "rel_*" prefix, we use the same
|
||||
format for storing various smaller files from the PostgreSQL data
|
||||
directory. They will use different suffixes and the naming scheme
|
||||
up to the LSN range varies. The Zenith source code uses the term
|
||||
"relish" to mean "a relation, or other file that's treated like a
|
||||
relation in the storage".
|
||||
|
||||
## Notation used in this document
|
||||
|
||||
The full path of a snapshot file looks like this:
|
||||
|
||||
.zenith/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_000000000169C348_0000000001702000
|
||||
|
||||
For simplicity, the examples below use a simplified notation for the
|
||||
paths. The timeline ID is replaced with the human-readable branch
|
||||
name, and spcnode+dbnode+relnode+forknum with a human-readable table
|
||||
name. The LSNs are also shorter. For example, a snapshot file for
|
||||
'orders' table on 'main' branch, with LSN range 100-200 would be:
|
||||
|
||||
main/orders_100_200
|
||||
|
||||
|
||||
# Creating snapshot files
|
||||
|
||||
Let's start with a simple example with a system that contains one
|
||||
branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||
of WAL is currently at LSN 250. In this starting situation, you would
|
||||
have two files on disk:
|
||||
|
||||
main/orders_100_200
|
||||
main/customers_100_200
|
||||
|
||||
In addition to those files, the recent changes between LSN 200 and the
|
||||
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||
latest records between 200-250 need to be re-read from the WAL.
|
||||
|
||||
Whenever enough WAL has been accumulated in memory, the page server
|
||||
writes out the changes in memory into new snapshot files. This process
|
||||
is called "checkpointing" (not to be confused with the PostgreSQL
|
||||
checkpoints, that's a different thing). The page server only creates
|
||||
snapshot files for relations that have been modified since the last
|
||||
checkpoint. For example, if the current end of WAL is at LSN 450, and
|
||||
the last checkpoint happened at LSN 400 but there haven't been any
|
||||
recent changes to the 'customers' table, you would have these files on
|
||||
disk:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/customers_100_200
|
||||
|
||||
If the customers table is modified later, a new file is created for it
|
||||
at the next checkpoint. The new file will cover the "gap" from the
|
||||
last snapshot file, so the LSN ranges are always contiguous:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/customers_100_200
|
||||
main/customers_200_500
|
||||
|
||||
## Reading page versions
|
||||
|
||||
Whenever a GetPage@LSN request comes in from the compute node, the
|
||||
page server needs to reconstruct the requested page, as it was at the
|
||||
requested LSN. To do that, the page server first checks the recent
|
||||
in-memory layer; if the requested page version is found there, it can
|
||||
be returned immediately without looking at the files on
|
||||
disk. Otherwise the page server needs to locate the snapshot file that
|
||||
contains the requested page version.
|
||||
|
||||
For example, if a request comes in for table 'orders' at LSN 250, the
|
||||
page server would load the 'main/orders_200_300' file into memory, and
|
||||
reconstruct and return the requested page from it, as it was at
|
||||
LSN 250. Because the snapshot file consists of a full image of the
|
||||
relation at the start LSN and the WAL, reconstructing the page
|
||||
involves replaying any WAL records applicable to the page between LSNs
|
||||
200-250, starting from the base image at LSN 200.
|
||||
|
||||
|
||||
# Multiple branches
|
||||
|
||||
Imagine that a child branch is created at LSN 250:
|
||||
|
||||
@250
|
||||
----main--+-------------------------->
|
||||
\
|
||||
+---child-------------->
|
||||
|
||||
|
||||
Then, the 'orders' table is updated differently on the 'main' and
|
||||
'child' branches. You now have this situation on disk:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/customers_100_200
|
||||
child/orders_250_300
|
||||
child/orders_300_400
|
||||
|
||||
Because the 'customers' table hasn't been modified on the child
|
||||
branch, there is no file for it there. If you request a page for it on
|
||||
the 'child' branch, the page server will not find any snapshot file
|
||||
for it in the 'child' directory, so it will recurse to look into the
|
||||
parent 'main' branch instead.
|
||||
|
||||
From the 'child' branch's point of view, the history for each relation
|
||||
is linear, and the request's LSN identifies unambiguously which file
|
||||
you need to look at. For example, the history for the 'orders' table
|
||||
on the 'main' branch consists of these files:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
|
||||
And from the 'child' branch's point of view, it consists of these
|
||||
files:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
child/orders_250_300
|
||||
child/orders_300_400
|
||||
|
||||
The branch metadata includes the point where the child branch was
|
||||
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||
page version from the 'child/orders_250_300' file. If the request LSN
|
||||
is 225, we read it from the 'main/orders_200_300' file instead. The
|
||||
page versions between 250-300 in the 'main/orders_200_300' file are
|
||||
ignored when operating on the child branch.
|
||||
|
||||
Note: It doesn't make any difference if the child branch is created
|
||||
when the end of the main branch was at LSN 250, or later when the tip of
|
||||
the main branch had already moved on. The latter case, creating a
|
||||
branch at a historic LSN, is how we support PITR in Zenith.
|
||||
|
||||
|
||||
# Garbage collection
|
||||
|
||||
In this scheme, we keep creating new snapshot files over time. We also
|
||||
need a mechanism to remove old files that are no longer needed,
|
||||
because disk space isn't infinite.
|
||||
|
||||
What files are still needed? Currently, the page server supports PITR
|
||||
and branching from any branch at any LSN that is "recent enough" from
|
||||
the tip of the branch. "Recent enough" is defined as an LSN horizon,
|
||||
which by default is 64 MB. (See DEFAULT_GC_HORIZON). For this
|
||||
example, let's assume that the LSN horizon is 150 units.
|
||||
|
||||
Let's look at the single branch scenario again. Imagine that the end
|
||||
of the branch is LSN 525, so that the GC horizon is currently at
|
||||
525-150 = 375
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/orders_400_500
|
||||
main/customers_100_200
|
||||
|
||||
We can remove files 'main/orders_100_200' and 'main/orders_200_300',
|
||||
because the end LSNs of those files are older than GC horizon 375, and
|
||||
there are more recent snapshot files for the table. 'main/orders_300_400'
|
||||
and 'main/orders_400_500' are still within the horizon, so they must be
|
||||
retained. 'main/customers_100_200' is old enough, but it cannot be
|
||||
removed because there is no newer snapshot file for the table.
|
||||
|
||||
Things get slightly more complicated with multiple branches. All of
|
||||
the above still holds, but in addition to recent files we must also
|
||||
retain older snapshot files that are still needed by child branches.
|
||||
For example, if a child branch is created at LSN 150, and the 'customers'
|
||||
table is updated on the branch, you would have these files:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/orders_400_500
|
||||
main/customers_100_200
|
||||
child/customers_150_300
|
||||
|
||||
In this situation, the 'main/orders_100_200' file cannot be removed,
|
||||
even though it is older than the GC horizon, because it is still
|
||||
needed by the child branch. 'main/orders_200_300' can still be
|
||||
removed. So after garbage collection, these files would remain:
|
||||
|
||||
main/orders_100_200
|
||||
|
||||
main/orders_300_400
|
||||
main/orders_400_500
|
||||
main/customers_100_200
|
||||
child/customers_150_300
|
||||
|
||||
If 'orders' is modified later on the 'child' branch, we will create a
|
||||
snapshot file for it on the child:
|
||||
|
||||
main/orders_100_200
|
||||
|
||||
main/orders_300_400
|
||||
main/orders_400_500
|
||||
main/customers_100_200
|
||||
child/customers_150_300
|
||||
child/orders_150_400
|
||||
|
||||
After this, the 'main/orders_100_200' file can be removed. It is no
|
||||
longer needed by the child branch, because there is a newer snapshot
|
||||
file there. TODO: This optimization hasn't been implemented! The GC
|
||||
algorithm will currently keep the file on the 'main' branch anyway, for
|
||||
as long as the child branch exists.
|
||||
|
||||
|
||||
# On LSN ranges
|
||||
|
||||
In principle, each relation can be checkpointed separately, i.e. the
|
||||
LSN ranges of the files don't need to line up. So this would be legal:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
main/customers_150_250
|
||||
main/customers_250_500
|
||||
|
||||
However, the code currently always checkpoints all relations together.
|
||||
So that situation doesn't arise in practice.
|
||||
|
||||
It would also be OK to have overlapping LSN ranges for the same relation:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_250_350
|
||||
main/orders_300_400
|
||||
|
||||
The code that reads the snapshot files should cope with this, but this
|
||||
situation doesn't arise either, because the checkpointing code never
|
||||
does that. It could be useful, however, as a transient state when
|
||||
garbage collecting around branch points, or explicit recovery
|
||||
points. For example, if we start with this:
|
||||
|
||||
main/orders_100_200
|
||||
main/orders_200_300
|
||||
main/orders_300_400
|
||||
|
||||
And there is a branch or explicit recovery point at LSN 150, we could
|
||||
replace 'main/orders_100_200' with 'main/orders_150_150' to keep a
|
||||
snapshot only at that exact point that's still needed, removing the
|
||||
other page versions around it.
|
||||
496
pageserver/src/layered_repository/inmemory_layer.rs
Normal file
496
pageserver/src/layered_repository/inmemory_layer.rs
Normal file
@@ -0,0 +1,496 @@
|
||||
//!
|
||||
//! An in-memory layer stores recently received page versions in memory. The page versions
|
||||
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::storage_layer::PageVersion;
|
||||
use crate::layered_repository::SnapshotLayer;
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
rel: RelishTag,
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
/// start is inclusive. There is no end LSN; we only use in-memory
|
||||
/// layer at the end of a timeline.
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
inner: Mutex<InMemoryLayerInner>,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// If this relation was dropped, remember when that happened. Lsn(0) means
|
||||
/// it hasn't been dropped
|
||||
drop_lsn: Lsn,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
|
||||
///
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
///
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
fn is_frozen(&self) -> bool {
|
||||
return false;
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
    self.timelineid
}
|
||||
|
||||
fn get_relish_tag(&self) -> RelishTag {
    self.rel
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
    self.start_lsn
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
return Lsn(u64::MAX);
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.lock().unwrap();
|
||||
return inner.drop_lsn != Lsn(0);
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(
|
||||
&self,
|
||||
walredo_mgr: &dyn WalRedoManager,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
let mut page_img: Option<Bytes> = None;
|
||||
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
|
||||
|
||||
{
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner. page_versions.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img) = &entry.page_image {
|
||||
page_img = Some(img.clone());
|
||||
need_base_image_lsn = None;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
records.push(rec.clone());
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_base_image_lsn = None;
|
||||
break;
|
||||
} else {
|
||||
need_base_image_lsn = Some(*entry_lsn);
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'page_versions'
|
||||
}
|
||||
records.reverse();
|
||||
|
||||
// If we needed a base image to apply the WAL records against, we should have found it in memory.
|
||||
if let Some(lsn) = need_base_image_lsn {
|
||||
if records.is_empty() {
|
||||
// no records, and no base image. This can happen if PostgreSQL extends a relation
|
||||
// but never writes the page.
|
||||
//
|
||||
// Would be nice to detect that situation better.
|
||||
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
bail!(
|
||||
"No base image found for page {} blk {} at {}/{}",
|
||||
self.rel,
|
||||
blknum,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
// If we have a page image, and no WAL, we're all set
|
||||
if records.is_empty() {
|
||||
if let Some(img) = page_img {
|
||||
trace!(
|
||||
"found page image for blk {} in {} at {}/{}, no WAL redo required",
|
||||
blknum,
|
||||
self.rel,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
Ok(img)
|
||||
} else {
|
||||
// FIXME: this ought to be an error?
|
||||
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||
Ok(ZERO_PAGE.clone())
|
||||
}
|
||||
} else {
|
||||
// We need to do WAL redo.
|
||||
//
|
||||
// If we don't have a base image, then the oldest WAL record better initialize
|
||||
// the page
|
||||
if page_img.is_none() && !records.first().unwrap().will_init {
|
||||
// FIXME: this ought to be an error?
|
||||
warn!(
|
||||
"Base image for page {}/{} at {} not found, but got {} WAL records",
|
||||
self.rel,
|
||||
blknum,
|
||||
lsn,
|
||||
records.len()
|
||||
);
|
||||
Ok(ZERO_PAGE.clone())
|
||||
} else {
|
||||
if page_img.is_some() {
|
||||
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||
} else {
|
||||
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||
}
|
||||
let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;
|
||||
|
||||
self.put_page_image(blknum, lsn, img.clone())?;
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_rel_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.inner.lock().unwrap();
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
let result = *entry;
|
||||
drop(inner);
|
||||
trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result);
|
||||
Ok(result)
|
||||
} else {
|
||||
bail!("No size found for {} at {} in memory", self.rel, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
/// Does this relation exist at given LSN?
|
||||
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
let result = if let Some((_entry_lsn, _entry)) = iter.next_back() {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()> {
|
||||
trace!(
|
||||
"put_page_version blk {} of {} at {}/{}",
|
||||
blknum,
|
||||
self.rel,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
let old = inner.page_versions.insert((blknum, lsn), pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!(
|
||||
"Page version of rel {:?} blk {} at {} already exists",
|
||||
self.rel, blknum, lsn
|
||||
);
|
||||
}
|
||||
|
||||
// Also update the relation size, if this extended the relation.
|
||||
if self.rel.is_blocky() {
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
let oldsize;
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
oldsize = *entry;
|
||||
} else {
|
||||
oldsize = 0;
|
||||
//bail!("No old size found for {} at {}", self.tag, lsn);
|
||||
}
|
||||
if blknum >= oldsize {
|
||||
trace!(
|
||||
"enlarging relation {} from {} to {} blocks at {}",
|
||||
self.rel,
|
||||
oldsize,
|
||||
blknum + 1,
|
||||
lsn
|
||||
);
|
||||
inner.relsizes.insert(lsn, blknum + 1);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
let old = inner.relsizes.insert(lsn, relsize);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
assert!(inner.drop_lsn == Lsn(0));
|
||||
inner.drop_lsn = lsn;
|
||||
|
||||
info!("dropped relation {} at {}", self.rel, lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Write the this in-memory layer to disk, as a snapshot layer.
|
||||
///
|
||||
/// The cutoff point for the layer that's written to disk is 'end_lsn'.
|
||||
/// If there were page versions newer than 'end_lsn', a new in-memory
|
||||
/// layer is returned with those page versions. Otherwise returns None.
|
||||
///
|
||||
fn freeze(&self, end_lsn: Lsn, walredo_mgr: &dyn WalRedoManager) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
info!(
|
||||
"freezing in memory layer for {} on timeline {} at {}",
|
||||
self.rel, self.timelineid, end_lsn
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
let dropped = inner.drop_lsn != Lsn(0);
|
||||
|
||||
// Divide all the page versions into old and new at the 'end_lsn' cutoff point.
|
||||
let mut old_page_versions = BTreeMap::new();
|
||||
let mut old_relsizes = BTreeMap::new();
|
||||
let mut new_relsizes = BTreeMap::new();
|
||||
let mut new_page_versions = BTreeMap::new();
|
||||
|
||||
if !dropped {
|
||||
for (lsn, size) in inner.relsizes.iter() {
|
||||
if *lsn > end_lsn {
|
||||
new_relsizes.insert(*lsn, *size);
|
||||
} else {
|
||||
old_relsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
if *lsn > end_lsn {
|
||||
new_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
} else {
|
||||
old_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let end_lsn = if dropped {
|
||||
assert!(inner.drop_lsn < end_lsn);
|
||||
inner.drop_lsn
|
||||
} else {
|
||||
end_lsn
|
||||
};
|
||||
|
||||
// Write the old page versions to disk.
|
||||
let snapfile = SnapshotLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.rel,
|
||||
self.start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
old_page_versions,
|
||||
old_relsizes,
|
||||
)?;
|
||||
let mut result: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
|
||||
// If there were any "new" page versions, initialize a new in-memory layer to hold
|
||||
// them
|
||||
if !new_relsizes.is_empty() || !new_page_versions.is_empty() {
|
||||
info!("created new in-mem layer for {} {}-", self.rel, end_lsn);
|
||||
|
||||
let new_layer = Self::copy_snapshot(self.conf, walredo_mgr, &snapfile, self.timelineid, self.tenantid, end_lsn)?;
|
||||
let mut new_inner = new_layer.inner.lock().unwrap();
|
||||
new_inner.page_versions.append(&mut new_page_versions);
|
||||
new_inner.relsizes.append(&mut new_relsizes);
|
||||
drop(new_inner);
|
||||
|
||||
result.push(Arc::new(new_layer));
|
||||
}
|
||||
result.push(Arc::new(snapfile));
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
    // Nothing to do here: the memory is released when the last reference
    // to the layer is dropped.
    Ok(())
}
|
||||
|
||||
fn unload(&self) -> Result<()> {
    // An in-memory layer cannot be unloaded (freeze it instead), so this
    // does nothing.
    Ok(())
}
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
rel: RelishTag,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
|
||||
rel,
|
||||
timelineid,
|
||||
start_lsn
|
||||
);
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
rel,
|
||||
start_lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: Lsn(0),
|
||||
page_versions: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize a new InMemoryLayer for, by copying the state at the given
|
||||
/// point in time from given existing layer.
|
||||
///
|
||||
pub fn copy_snapshot(
|
||||
conf: &'static PageServerConf,
|
||||
walredo_mgr: &dyn WalRedoManager,
|
||||
src: &dyn Layer,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||
src.get_relish_tag(),
|
||||
timelineid,
|
||||
lsn
|
||||
);
|
||||
let mut page_versions = BTreeMap::new();
|
||||
let mut relsizes = BTreeMap::new();
|
||||
|
||||
let size;
|
||||
if src.get_relish_tag().is_blocky() {
|
||||
size = src.get_rel_size(lsn)?;
|
||||
relsizes.insert(lsn, size);
|
||||
} else {
|
||||
size = 1;
|
||||
}
|
||||
|
||||
for blknum in 0..size {
|
||||
let img = src.get_page_at_lsn(walredo_mgr, blknum, lsn)?;
|
||||
let pv = PageVersion {
|
||||
page_image: Some(img),
|
||||
record: None,
|
||||
};
|
||||
page_versions.insert((blknum, lsn), pv);
|
||||
}
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
rel: src.get_relish_tag(),
|
||||
start_lsn: lsn,
|
||||
inner: Mutex::new(InMemoryLayerInner {
|
||||
drop_lsn: Lsn(0),
|
||||
page_versions: page_versions,
|
||||
relsizes: relsizes,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> String {
|
||||
let mut result = format!(
|
||||
"----- inmemory layer for {} {}-> ----\n",
|
||||
self.rel, self.start_lsn
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
result += &format!("{}: {}\n", k, v);
|
||||
}
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
result += &format!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
127
pageserver/src/layered_repository/layer_map.rs
Normal file
127
pageserver/src/layered_repository/layer_map.rs
Normal file
@@ -0,0 +1,127 @@
|
||||
///
|
||||
/// The layer map tracks what layers exist for all the relations in a timeline.
|
||||
///
|
||||
/// When the timeline is first accessed, the server lists all snapshot files
|
||||
/// in the timelines/<timelineid> directory, and populates this map with
|
||||
/// SnapshotLayers corresponding to each file. When new WAL is received,
|
||||
/// we create InMemoryLayers to hold the incoming records. Now and then,
|
||||
/// in the checkpoint() function, the in-memory layers are frozen, forming
|
||||
/// new snapshot layers and corresponding files are written to disk.
|
||||
///
|
||||
|
||||
use anyhow::Result;
|
||||
use log::*;
|
||||
use crate::relish::*;
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use std::collections::HashSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// LayerMap is a BTreeMap keyed by RelishTag and the layer's start LSN.
|
||||
/// It provides a couple of convenience functions over a plain BTreeMap
|
||||
pub struct LayerMap {
|
||||
pub inner: BTreeMap<(RelishTag, Lsn), Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Look up using the given rel tag and LSN. This differs from a plain
|
||||
/// key-value lookup in that if there is any layer that covers the
|
||||
/// given LSN, or precedes the given LSN, it is returned. In other words,
|
||||
/// you don't need to know the exact start LSN of the layer.
|
||||
///
|
||||
pub fn get(&self, tag: RelishTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let startkey = (tag, Lsn(0));
|
||||
let endkey = (tag, lsn);
|
||||
|
||||
if let Some((_k, v)) = self
|
||||
.inner
|
||||
.range((Included(startkey), Included(endkey)))
|
||||
.next_back()
|
||||
{
|
||||
Some(Arc::clone(v))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Add 'layer' to the map, keyed by its relish tag and start LSN. An
/// existing entry with the same key is replaced.
pub fn insert(&mut self, layer: Arc<dyn Layer>) {
    let key = (layer.get_relish_tag(), layer.get_start_lsn());

    // We own the Arc, so it can be moved into the map without cloning it.
    self.inner.insert(key, layer);
}
|
||||
|
||||
/// Remove the entry stored under the relish tag and start LSN of 'layer',
/// if there is one.
pub fn remove(&mut self, layer: &dyn Layer) {
    let key = (layer.get_relish_tag(), layer.get_start_lsn());

    self.inner.remove(&key);
}
|
||||
|
||||
pub fn list_rels(&self,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
) -> Result<HashSet<RelTag>> {
|
||||
let mut rels: HashSet<RelTag> = HashSet::new();
|
||||
|
||||
// Scan the timeline directory to get all rels in this timeline.
|
||||
for ((rel, _lsn), _l) in self.inner.iter() {
|
||||
if let RelishTag::Relation(reltag) = rel {
|
||||
// FIXME: skip if it was dropped before the requested LSN. But there is no
|
||||
// LSN argument
|
||||
|
||||
if (spcnode == 0 || reltag.spcnode == spcnode)
|
||||
&& (dbnode == 0 || reltag.dbnode == dbnode)
|
||||
{
|
||||
rels.insert(*reltag);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
pub fn list_nonrels(&self, _lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||
let mut rels: HashSet<RelishTag> = HashSet::new();
|
||||
|
||||
// Scan the timeline directory to get all rels in this timeline.
|
||||
for ((rel, _lsn), _l) in self.inner.iter() {
|
||||
// FIXME: skip if it was dropped before the requested LSN.
|
||||
|
||||
if let RelishTag::Relation(_) = rel {
|
||||
} else {
|
||||
rels.insert(*rel);
|
||||
}
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Is there a newer layer for given relation?
|
||||
pub fn newer_layer_exists(&self, rel: RelishTag, lsn: Lsn) -> bool {
|
||||
let startkey = (rel, lsn);
|
||||
let endkey = (rel, Lsn(u64::MAX));
|
||||
|
||||
for ((_rel, newer_lsn), layer) in self
|
||||
.inner
|
||||
.range((Included(startkey), Included(endkey)))
|
||||
{
|
||||
if layer.get_end_lsn() > lsn {
|
||||
info!("found later layer for rel {}, {} {}-{}", rel, lsn, newer_lsn, layer.get_end_lsn());
|
||||
return true;
|
||||
} else {
|
||||
info!("found singleton layer for rel {}, {} {}", rel, lsn, newer_lsn);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
info!("no later layer found for rel {}, {}", rel, lsn);
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LayerMap {
|
||||
fn default() -> Self {
|
||||
LayerMap { inner: BTreeMap::new() }
|
||||
}
|
||||
}
|
||||
632
pageserver/src/layered_repository/snapshot_layer.rs
Normal file
632
pageserver/src/layered_repository/snapshot_layer.rs
Normal file
@@ -0,0 +1,632 @@
|
||||
//!
|
||||
//! A SnapshotLayer represents one snapshot file on disk. One file holds all page versions
|
||||
//! and size information of one relation, in a range of LSN.
|
||||
//! The name "snapshot file" is a bit of a misnomer because a snapshot file doesn't
|
||||
//! contain a snapshot at a specific LSN, but rather all the page versions in a range
|
||||
//! of LSNs.
|
||||
//!
|
||||
//! Currently, a snapshot file contains full information needed to reconstruct any
|
||||
//! page version in the LSN range, without consulting any other snapshot files. When
|
||||
//! a new snapshot file is created for writing, the full contents of relation are
|
||||
//! materialized as it is at the beginning of the LSN range. That can be very expensive,
|
||||
//! we should find a way to store differential files. But this keeps the read-side
|
||||
//! of things simple. You can find the correct snapshot file based on RelishTag and
|
||||
//! timeline+LSN, and once you've located it, you have all the data you need in that
|
||||
//! file.
|
||||
//!
|
||||
//! When a snapshot file needs to be accessed, we slurp the whole file into memory, into
|
||||
//! a SnapshotLayer struct.
|
||||
//!
|
||||
//! On disk, the snapshot files are stored in .zenith/timelines/<timelineid> directory.
|
||||
//! Currently, there are no subdirectories, and each snapshot file is named like this:
|
||||
//!
|
||||
//! rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! rel_1663_13990_2609_0_000000000169C348_000000000169C349
|
||||
//!
|
||||
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
|
||||
//! So the above example would become:
|
||||
//!
|
||||
//! rel_1663_13990_2609_0_000000000169C348_000000000169C349_DROPPED
|
||||
//!
|
||||
//! The end LSN indicates when it was dropped in that case, we don't store it in the
|
||||
//! file contents in any way.
|
||||
//!
|
||||
//! A snapshot file is constructed using the 'bookfile' crate. Each file consists of two
|
||||
//! parts: the page versions and the relation sizes. They are stored as separate chapters.
|
||||
//!
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::storage_layer::PageVersion;
|
||||
use crate::layered_repository::storage_layer::ZERO_PAGE;
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTimelineId, ZTenantId};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Magic constant to identify a Zenith snapshot file
|
||||
static SNAPSHOT_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
|
||||
static PAGE_VERSIONS_CHAPTER: u64 = 1;
|
||||
static REL_SIZES_CHAPTER: u64 = 2;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
struct SnapshotFileName {
|
||||
rel: RelishTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
}
|
||||
|
||||
impl SnapshotFileName {
|
||||
fn from_str(fname: &str) -> Option<Self> {
|
||||
// Split the filename into parts
|
||||
//
|
||||
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>
|
||||
//
|
||||
// or if it was dropped:
|
||||
//
|
||||
// <spcnode>_<dbnode>_<relnode>_<forknum>_<start LSN>_<end LSN>_DROPPED
|
||||
//
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
|
||||
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
let mut dropped = false;
|
||||
if let Some(suffix) = parts.next() {
|
||||
if suffix == "DROPPED" {
|
||||
dropped = true;
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
return None;
|
||||
}
|
||||
}
|
||||
if parts.next().is_some() {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(SnapshotFileName {
|
||||
rel,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
})
|
||||
}
|
||||
|
||||
fn to_string(&self) -> String {
|
||||
let basename = match self.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => format!("pg_control_checkpoint"),
|
||||
RelishTag::ControlFile => format!("pg_control"),
|
||||
};
|
||||
|
||||
format!(
|
||||
"{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
u64::from(self.start_lsn),
|
||||
u64::from(self.end_lsn),
|
||||
if self.dropped { "_DROPPED" } else { "" }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SnapshotFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// SnapshotLayer is the in-memory data structure associated with an on-disk snapshot file.
|
||||
/// We hold a SnapshotLayer in memory for each file, in the LayerMap. If a layer is in
|
||||
/// "loaded" state, we have a copy of the file in memory, in 'inner'. Otherwise the struct
|
||||
/// is just a placeholder for a file that exists on disk, and it needs to be loaded
|
||||
/// before using it in queries.
|
||||
///
|
||||
pub struct SnapshotLayer {
|
||||
conf: &'static PageServerConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub rel: RelishTag,
|
||||
|
||||
//
|
||||
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||
// start is inclusive, and end is exclusive.
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
|
||||
inner: Mutex<SnapshotLayerInner>
|
||||
}
|
||||
|
||||
pub struct SnapshotLayerInner {
|
||||
// If false, the 'page_versions' and 'relsizes' have not been loaded into memory
|
||||
// yet.
|
||||
loaded: bool,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
|
||||
///
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
///
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for SnapshotLayer {
|
||||
fn is_frozen(&self) -> bool {
|
||||
return true;
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
    self.timelineid
}
|
||||
|
||||
fn get_relish_tag(&self) -> RelishTag {
    self.rel
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
return self.dropped;
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
    self.start_lsn
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
    self.end_lsn
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(
|
||||
&self,
|
||||
walredo_mgr: &dyn WalRedoManager,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
let mut page_img: Option<Bytes> = None;
|
||||
let mut need_base_image_lsn: Option<Lsn> = Some(lsn);
|
||||
{
|
||||
let inner = self.load()?;
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner.page_versions.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img) = &entry.page_image {
|
||||
page_img = Some(img.clone());
|
||||
need_base_image_lsn = None;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
records.push(rec.clone());
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_base_image_lsn = None;
|
||||
break;
|
||||
} else {
|
||||
need_base_image_lsn = Some(*entry_lsn);
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
records.reverse();
|
||||
|
||||
// If we needed a base image to apply the WAL records against, we should have found it in memory.
|
||||
if let Some(lsn) = need_base_image_lsn {
|
||||
if records.is_empty() {
|
||||
// no records, and no base image. This can happen if PostgreSQL extends a relation
|
||||
// but never writes the page.
|
||||
//
|
||||
// Would be nice to detect that situation better.
|
||||
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
bail!(
|
||||
"No base image found for page {} blk {} at {}/{}",
|
||||
self.rel,
|
||||
blknum,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
// If we have a page image, and no WAL, we're all set
|
||||
if records.is_empty() {
|
||||
if let Some(img) = page_img {
|
||||
trace!(
|
||||
"found page image for blk {} in {} at {}/{}, no WAL redo required",
|
||||
blknum,
|
||||
self.rel,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
Ok(img)
|
||||
} else {
|
||||
// FIXME: this ought to be an error?
|
||||
warn!("Page {} blk {} at {} not found", self.rel, blknum, lsn);
|
||||
Ok(ZERO_PAGE.clone())
|
||||
}
|
||||
} else {
|
||||
// We need to do WAL redo.
|
||||
//
|
||||
// If we don't have a base image, then the oldest WAL record better initialize
|
||||
// the page
|
||||
if page_img.is_none() && !records.first().unwrap().will_init {
|
||||
// FIXME: this ought to be an error?
|
||||
warn!(
|
||||
"Base image for page {} blk {} at {} not found, but got {} WAL records",
|
||||
self.rel,
|
||||
blknum,
|
||||
lsn,
|
||||
records.len()
|
||||
);
|
||||
Ok(ZERO_PAGE.clone())
|
||||
} else {
|
||||
if page_img.is_some() {
|
||||
trace!("found {} WAL records and a base image for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||
} else {
|
||||
trace!("found {} WAL records that will init the page for blk {} in {} at {}/{}, performing WAL redo", records.len(), blknum, self.rel, self.timelineid, lsn);
|
||||
}
|
||||
let img = walredo_mgr.request_redo(self.rel, blknum, lsn, page_img, records)?;
|
||||
|
||||
// FIXME: Should we memoize the page image in memory, so that
|
||||
// we wouldn't need to reconstruct it again, if it's requested again?
|
||||
//self.put_page_image(blknum, lsn, img.clone())?;
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_rel_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
let result = *entry;
|
||||
drop(inner);
|
||||
trace!("get_relsize: {} at {} -> {}", self.rel, lsn, result);
|
||||
Ok(result)
|
||||
} else {
|
||||
error!("No size found for {} at {} in snapshot layer {} {} {}", self.rel, lsn, self.rel, self.start_lsn, self.end_lsn);
|
||||
bail!("No size found for {} at {} in snapshot layer", self.rel, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
/// Does this relation exist at given LSN?
|
||||
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
let result = if let Some((_entry_lsn, _entry)) = iter.next_back() {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
// Unsupported write operations
|
||||
fn put_page_version(&self, blknum: u32, lsn: Lsn, _pv: PageVersion) -> Result<()> {
|
||||
panic!(
|
||||
"cannot modify historical snapshot layer, rel {} blk {} at {}/{}, {}-{}",
|
||||
self.rel, blknum, self.timelineid, lsn, self.start_lsn, self.end_lsn
|
||||
);
|
||||
}
|
||||
fn put_truncation(&self, _lsn: Lsn, _relsize: u32) -> anyhow::Result<()> {
|
||||
bail!("cannot modify historical snapshot layer");
|
||||
}
|
||||
|
||||
fn put_unlink(&self, _lsn: Lsn) -> anyhow::Result<()> {
|
||||
bail!("cannot modify historical snapshot layer");
|
||||
}
|
||||
|
||||
fn freeze(&self, _end_lsn: Lsn, _walredo_mgr: &dyn WalRedoManager) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
bail!("cannot freeze historical snapshot layer");
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_versions = BTreeMap::new();
|
||||
inner.relsizes = BTreeMap::new();
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl SnapshotLayer {
|
||||
fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&SnapshotFileName {
|
||||
rel: self.rel,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn path_for(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
fname: &SnapshotFileName,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid).join(fname.to_string())
|
||||
}
|
||||
|
||||
/// Create a new snapshot file, using the given btreemaps containing the page versions and
|
||||
/// relsizes.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
pub fn create(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
rel: RelishTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
) -> Result<SnapshotLayer> {
|
||||
let snapfile = SnapshotLayer {
|
||||
conf: conf,
|
||||
timelineid: timelineid,
|
||||
tenantid: tenantid,
|
||||
rel: rel,
|
||||
start_lsn: start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(SnapshotLayerInner {
|
||||
loaded: true,
|
||||
page_versions: page_versions,
|
||||
relsizes: relsizes,
|
||||
}),
|
||||
};
|
||||
let inner = snapfile.inner.lock().unwrap();
|
||||
|
||||
// Write the in-memory btreemaps into files
|
||||
let path = snapfile.path();
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
|
||||
let file = File::create(&path)?;
|
||||
let book = BookWriter::new(file, SNAPSHOT_FILE_MAGIC)?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSIONS_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.page_versions)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and relsizes to separate chapter
|
||||
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
Ok(snapfile)
|
||||
}
|
||||
|
||||
fn load(&self) -> Result<MutexGuard<SnapshotLayerInner>> {
|
||||
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = Self::path_for(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&SnapshotFileName {
|
||||
rel: self.rel,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let mut book = Book::new(file)?;
|
||||
|
||||
let chapter_index = book
|
||||
.find_chapter(PAGE_VERSIONS_CHAPTER)
|
||||
.ok_or_else(|| anyhow!("could not find page versions chapter in {}", path.display()))?;
|
||||
let chapter = book.read_chapter(chapter_index)?;
|
||||
let page_versions = BTreeMap::des(&chapter)?;
|
||||
|
||||
let chapter_index = book
|
||||
.find_chapter(REL_SIZES_CHAPTER)
|
||||
.ok_or_else(|| anyhow!("could not find relsizes chapter in {}", path.display()))?;
|
||||
let chapter = book.read_chapter(chapter_index)?;
|
||||
let relsizes = BTreeMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = SnapshotLayerInner {
|
||||
loaded: true,
|
||||
page_versions,
|
||||
relsizes,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
|
||||
// TODO: returning an Iterator would be more idiomatic
|
||||
pub fn list_snapshot_files(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
let path = conf.timeline_path(&timelineid, &tenantid);
|
||||
|
||||
let mut snapfiles: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
for direntry in fs::read_dir(path)? {
|
||||
let fname = direntry?.file_name();
|
||||
let fname = fname.to_str().unwrap();
|
||||
|
||||
if let Some(snapfilename) = SnapshotFileName::from_str(fname) {
|
||||
|
||||
let snapfile = SnapshotLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
rel: snapfilename.rel,
|
||||
start_lsn: snapfilename.start_lsn,
|
||||
end_lsn: snapfilename.end_lsn,
|
||||
dropped: snapfilename.dropped,
|
||||
inner: Mutex::new(SnapshotLayerInner {
|
||||
loaded: false,
|
||||
page_versions: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
};
|
||||
|
||||
snapfiles.push(Arc::new(snapfile));
|
||||
}
|
||||
}
|
||||
return Ok(snapfiles);
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> String {
|
||||
let mut result = format!(
|
||||
"----- snapshot layer for {} {}-{} ----\n",
|
||||
self.rel, self.start_lsn, self.end_lsn
|
||||
);
|
||||
|
||||
let inner = self.inner.lock().unwrap();
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
result += &format!("{}: {}\n", k, v);
|
||||
}
|
||||
//for (k, v) in inner.page_versions.iter() {
|
||||
// result += &format!("blk {} at {}: {}/{}\n", k.0, k.1, v.page_image.is_some(), v.record.is_some());
|
||||
//}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
85
pageserver/src/layered_repository/storage_layer.rs
Normal file
85
pageserver/src/layered_repository/storage_layer.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Arc;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// An all-zeroes page image of 8192 bytes, backed by a static buffer.
pub static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
///
|
||||
/// Represents a version of a page at a specific LSN. The LSN is part of the key of
|
||||
/// the entry in the 'page_versions' map (keyed by block number and LSN); it is not duplicated here.
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as a WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PageVersion {
|
||||
/// A full image of the page at this version (an 8kb page), if one is stored.
|
||||
pub page_image: Option<Bytes>,
|
||||
/// WAL record to get from the previous page version to this one, if one is stored.
|
||||
pub record: Option<WALRecord>,
|
||||
}
|
||||
|
||||
pub trait Layer: Send + Sync {
|
||||
fn is_frozen(&self) -> bool;
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
fn get_relish_tag(&self) -> RelishTag;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
fn get_end_lsn(&self) -> Lsn;
|
||||
fn is_dropped(&self) -> bool;
|
||||
|
||||
fn get_page_at_lsn(
|
||||
&self,
|
||||
walredo_mgr: &dyn WalRedoManager,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes>;
|
||||
|
||||
fn get_rel_size(&self, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
fn get_rel_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<()>;
|
||||
|
||||
fn put_truncation(&self, lsn: Lsn, relsize: u32) -> anyhow::Result<()>;
|
||||
|
||||
fn put_unlink(&self, lsn: Lsn) -> anyhow::Result<()>;
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> Result<()> {
|
||||
// FIXME: If this is the first version of this page, reconstruct the image
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
rec.lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()> {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: Some(img),
|
||||
record: None,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
fn freeze(&self, end_lsn: Lsn, walredo_mgr: &dyn WalRedoManager) -> Result<Vec<Arc<dyn Layer>>>;
|
||||
|
||||
fn delete(&self) -> Result<()>;
|
||||
|
||||
fn unload(&self) -> Result<()>;
|
||||
}
|
||||
@@ -9,12 +9,14 @@ use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod layered_repository;
|
||||
pub mod logger;
|
||||
pub mod object_key;
|
||||
pub mod object_repository;
|
||||
pub mod object_store;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod rocksdb_storage;
|
||||
@@ -39,6 +41,14 @@ pub struct PageServerConf {
|
||||
pub workdir: PathBuf,
|
||||
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
pub repository_format: RepositoryFormat,
|
||||
}
|
||||
|
||||
/// Repository format configured for the page server (see
/// `PageServerConf::repository_format`).
// NOTE(review): presumably `Layered` selects the `layered_repository` module and
// `RocksDb` the RocksDB-backed object repository — confirm where the value is consumed.
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum RepositoryFormat {
|
||||
Layered,
|
||||
RocksDb,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
@@ -182,7 +192,7 @@ impl fmt::Display for ZId {
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId(ZId);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use crate::repository::{BufferTag, RelTag};
|
||||
use crate::waldecoder::TransactionId;
|
||||
//!
|
||||
//! Common structs shared by object_repository.rs and object_store.rs.
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::ZTimelineId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -8,6 +11,7 @@ use serde::{Deserialize, Serialize};
|
||||
/// repository. It is shared between object_repository.rs and object_store.rs.
|
||||
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
|
||||
/// using the key given by the caller.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ObjectKey {
|
||||
pub timeline: ZTimelineId,
|
||||
@@ -15,70 +19,31 @@ pub struct ObjectKey {
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
|
||||
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into pages
|
||||
/// of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct SlruBufferTag {
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Special type of Postgres files: pg_filenode.map is needed to map
|
||||
/// catalog table OIDs to filenode numbers, which define filename.
|
||||
///
|
||||
/// Each database has a map file for its local mapped catalogs,
|
||||
/// and there is a separate map file for shared catalogs.
|
||||
///
|
||||
/// These files have untypical size of 512 bytes.
|
||||
///
|
||||
/// See PostgreSQL relmapper.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct DatabaseTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation files that keep state for prepared transactions.
|
||||
/// Unlike other files these are not divided into pages.
|
||||
///
|
||||
/// See PostgreSQL twophase.c for details.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct PrepareTag {
|
||||
pub xid: TransactionId,
|
||||
}
|
||||
|
||||
/// ObjectTag is a part of ObjectKey that is specific to the type of
|
||||
/// the stored object.
|
||||
///
|
||||
/// NB: the order of the enum values is significant! In particular,
|
||||
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
|
||||
///
|
||||
/// Buffer is the kind of object that is accessible by the public
|
||||
/// get_page_at_lsn() / put_page_image() / put_wal_record() functions in
|
||||
/// the repository.rs interface. The rest are internal objects stored in
|
||||
/// the key-value store, to store various metadata. They're not directly
|
||||
/// accessible outside object_repository.rs
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum ObjectTag {
|
||||
// dummy tag preceding all other keys
|
||||
FirstTag,
|
||||
|
||||
// Metadata about a timeline. Not versioned.
|
||||
TimelineMetadataTag,
|
||||
// Special entry that represents PostgreSQL checkpoint.
|
||||
// We use it to track fields needed to restore controlfile checkpoint.
|
||||
Checkpoint,
|
||||
// Various types of non-relation files.
|
||||
// We need them to bootstrap compute node.
|
||||
ControlFile,
|
||||
Clog(SlruBufferTag),
|
||||
MultiXactMembers(SlruBufferTag),
|
||||
MultiXactOffsets(SlruBufferTag),
|
||||
FileNodeMap(DatabaseTag),
|
||||
TwoPhase(PrepareTag),
|
||||
// put relations at the end of enum to allow efficient iterations through non-rel objects
|
||||
RelationMetadata(RelTag),
|
||||
RelationBuffer(BufferTag),
|
||||
|
||||
// These objects store metadata about one relish. Currently it's used
|
||||
// just to track the relish's size. It's not used for non-blocky relishes
|
||||
// at all.
|
||||
RelationMetadata(RelishTag),
|
||||
|
||||
// These are the pages exposed in the public Repository/Timeline interface.
|
||||
Buffer(RelishTag, u32),
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
//! key-value store for each timeline.
|
||||
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::restore_local_repo::import_timeline_wal;
|
||||
use crate::walredo::WalRedoManager;
|
||||
@@ -162,20 +163,23 @@ impl Repository for ObjectRepository {
|
||||
&ObjectValue::ser(&val)?,
|
||||
)?;
|
||||
|
||||
// Copy non-rel objects
|
||||
for tag in src_timeline.list_nonrels(at_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::TimelineMetadataTag => {} // skip it
|
||||
_ => {
|
||||
let img = src_timeline.get_page_at_lsn_nowait(tag, at_lsn)?;
|
||||
let val = ObjectValue::Page(PageEntry::Page(img));
|
||||
let key = ObjectKey { timeline: dst, tag };
|
||||
self.obj_store.put(&key, at_lsn, &ObjectValue::ser(&val)?)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn gc_iteration(&self, timelineid: Option<ZTimelineId>, horizon: u64, compact: bool) -> Result<GcResult> {
|
||||
if let Some(timelineid) = timelineid {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// FIXME: If the timeline isn't opened yet, don't open it just for GC.
|
||||
if let Some(timeline) = timelines.get(&timelineid) {
|
||||
return timeline.gc_iteration(horizon, compact);
|
||||
}
|
||||
} else {
|
||||
// FIXME: the object repository doesn't support GC on all timelines. Should
|
||||
// iterate all the timelines here
|
||||
}
|
||||
return Ok(GcResult::default())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -187,7 +191,7 @@ impl Repository for ObjectRepository {
|
||||
/// To prevent memory overflow metadata only of the most recent version of relation is cached.
|
||||
/// If page server needs to access some older version, then object storage has to be accessed.
|
||||
///
|
||||
struct RelMetadata {
|
||||
struct RelishMetadata {
|
||||
size: u32, // size of relation
|
||||
last_updated: Lsn, // lsn of last metadata update (used to determine when cache value can be used)
|
||||
}
|
||||
@@ -227,7 +231,7 @@ pub struct ObjectTimeline {
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
|
||||
rel_meta: RwLock<BTreeMap<RelTag, RelMetadata>>,
|
||||
rel_meta: RwLock<BTreeMap<RelishTag, RelishMetadata>>,
|
||||
}
|
||||
|
||||
impl ObjectTimeline {
|
||||
@@ -266,19 +270,28 @@ impl Timeline for ObjectTimeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: ObjectTag, req_lsn: Lsn) -> Result<Bytes> {
|
||||
fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, req_lsn: Lsn) -> Result<Bytes> {
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
self.get_page_at_lsn_nowait(tag, lsn)
|
||||
self.get_page_at_lsn_nowait(tag, blknum, lsn)
|
||||
}
|
||||
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, req_lsn: Lsn) -> Result<Bytes> {
|
||||
fn get_page_at_lsn_nowait(&self, rel: RelishTag, blknum: u32, req_lsn: Lsn) -> Result<Bytes> {
|
||||
if !rel.is_blocky() && blknum != 0 {
|
||||
bail!(
|
||||
"invalid request for block {} for non-blocky relish {}",
|
||||
blknum,
|
||||
rel
|
||||
);
|
||||
}
|
||||
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
// Look up the page entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let object_tag = ObjectTag::Buffer(rel, blknum);
|
||||
let searchkey = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag,
|
||||
tag: object_tag,
|
||||
};
|
||||
let mut iter = self.object_versions(&*self.obj_store, &searchkey, req_lsn)?;
|
||||
|
||||
@@ -291,14 +304,16 @@ impl Timeline for ObjectTimeline {
|
||||
}
|
||||
ObjectValue::Page(PageEntry::WALRecord(_rec)) => {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
let (base_img, records) = self.collect_records_for_apply(rel, blknum, lsn)?;
|
||||
page_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(rel, blknum, lsn, base_img, records)?;
|
||||
|
||||
// Garbage collection assumes that we remember the materialized page
|
||||
// version. Otherwise we could opt to not do it, with the downside that
|
||||
// the next GetPage@LSN call of the same page version would have to
|
||||
// redo the WAL again.
|
||||
self.put_page_image(tag, lsn, page_img.clone(), false)?;
|
||||
self.put_page_image(rel, blknum, lsn, page_img.clone(), false)?;
|
||||
}
|
||||
ObjectValue::SLRUTruncate => page_img = Bytes::from_static(&ZERO_PAGE),
|
||||
_ => bail!("Invalid object kind, expected a page entry or SLRU truncate"),
|
||||
@@ -310,19 +325,23 @@ impl Timeline for ObjectTimeline {
|
||||
"Returning page with LSN {:X}/{:X} for {:?} from {} (request {})",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag,
|
||||
object_tag,
|
||||
lsn,
|
||||
req_lsn
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
trace!("page {:?} at {} not found", tag, req_lsn);
|
||||
trace!("page {:?} at {} not found", object_tag, req_lsn);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
/// Get size of relation
|
||||
fn get_rel_size(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
|
||||
fn get_rel_size(&self, rel: RelishTag, lsn: Lsn) -> Result<u32> {
|
||||
if !rel.is_blocky() {
|
||||
bail!("invalid get_rel_size request for non-blocky relish {}", rel);
|
||||
}
|
||||
|
||||
let lsn = self.wait_lsn(lsn)?;
|
||||
|
||||
match self.relsize_get_nowait(rel, lsn)? {
|
||||
@@ -332,7 +351,7 @@ impl Timeline for ObjectTimeline {
|
||||
}
|
||||
|
||||
/// Does relation exist at given LSN?
|
||||
fn get_rel_exists(&self, rel: RelTag, req_lsn: Lsn) -> Result<bool> {
|
||||
fn get_rel_exists(&self, rel: RelishTag, req_lsn: Lsn) -> Result<bool> {
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
{
|
||||
let rel_meta = self.rel_meta.read().unwrap();
|
||||
@@ -353,8 +372,34 @@ impl Timeline for ObjectTimeline {
|
||||
}
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
self.obj_store.list_objects(self.timelineid, true, lsn)
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||
// List all non-relations in this timeline.
|
||||
let mut all_rels = self.obj_store.list_nonrels(self.timelineid, lsn)?;
|
||||
|
||||
// Also list all nonrelations in ancestor timelines. If a nonrelation hasn't been modified
|
||||
// after the fork, there will be no trace of it in the object store with the current
|
||||
// timeline id.
|
||||
let mut prev_timeline: Option<ZTimelineId> = self.ancestor_timeline;
|
||||
let mut lsn = self.ancestor_lsn;
|
||||
while let Some(timeline) = prev_timeline {
|
||||
let this_rels = self.obj_store.list_nonrels(timeline, lsn)?;
|
||||
|
||||
for rel in this_rels {
|
||||
all_rels.insert(rel);
|
||||
}
|
||||
|
||||
// Load ancestor metadata.
|
||||
let v = self
|
||||
.obj_store
|
||||
.get(&timeline_metadata_key(timeline), Lsn(0))
|
||||
.with_context(|| "timeline not found in repository")?;
|
||||
let metadata = ObjectValue::des_timeline_metadata(&v)?;
|
||||
|
||||
prev_timeline = metadata.ancestor_timeline;
|
||||
lsn = metadata.ancestor_lsn;
|
||||
}
|
||||
|
||||
Ok(all_rels)
|
||||
}
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
@@ -400,31 +445,39 @@ impl Timeline for ObjectTimeline {
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()> {
|
||||
fn put_wal_record(&self, rel: RelishTag, blknum: u32, rec: WALRecord) -> Result<()> {
|
||||
if !rel.is_blocky() && blknum != 0 {
|
||||
bail!(
|
||||
"invalid request for block {} for non-blocky relish {}",
|
||||
blknum,
|
||||
rel
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = rec.lsn;
|
||||
self.put_page_entry(&tag, lsn, PageEntry::WALRecord(rec))?;
|
||||
debug!("put_wal_record {:?} at {}", tag, lsn);
|
||||
self.put_page_entry(&rel, blknum, lsn, PageEntry::WALRecord(rec))?;
|
||||
debug!("put_wal_record {} at {}", rel, lsn);
|
||||
|
||||
if let ObjectTag::RelationBuffer(tag) = tag {
|
||||
if rel.is_blocky() {
|
||||
// Also check if this created or extended the file
|
||||
let old_nblocks = self.relsize_get_nowait(tag.rel, lsn)?.unwrap_or(0);
|
||||
let old_nblocks = self.relsize_get_nowait(rel, lsn)?.unwrap_or(0);
|
||||
|
||||
if tag.blknum >= old_nblocks {
|
||||
let new_nblocks = tag.blknum + 1;
|
||||
if blknum >= old_nblocks {
|
||||
let new_nblocks = blknum + 1;
|
||||
|
||||
trace!(
|
||||
"Extended relation {} from {} to {} blocks at {}",
|
||||
tag.rel,
|
||||
"Extended {} from {} to {} blocks at {}",
|
||||
rel,
|
||||
old_nblocks,
|
||||
new_nblocks,
|
||||
lsn
|
||||
);
|
||||
|
||||
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
|
||||
self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
|
||||
let mut rel_meta = self.rel_meta.write().unwrap();
|
||||
rel_meta.insert(
|
||||
tag.rel,
|
||||
RelMetadata {
|
||||
rel,
|
||||
RelishMetadata {
|
||||
size: new_nblocks,
|
||||
last_updated: lsn,
|
||||
},
|
||||
@@ -435,23 +488,12 @@ impl Timeline for ObjectTimeline {
|
||||
}
|
||||
|
||||
/// Unlink relation. This method is used for marking dropped relations.
|
||||
fn put_unlink(&self, rel_tag: RelTag, lsn: Lsn) -> Result<()> {
|
||||
fn put_unlink(&self, rel_tag: RelishTag, lsn: Lsn) -> Result<()> {
|
||||
self.put_relsize_entry(&rel_tag, lsn, RelationSizeEntry::Unlink)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Truncate SLRU segment
|
||||
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag,
|
||||
};
|
||||
let val = ObjectValue::SLRUTruncate;
|
||||
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>> {
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
@@ -478,38 +520,47 @@ impl Timeline for ObjectTimeline {
|
||||
///
|
||||
fn put_page_image(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
img: Bytes,
|
||||
update_meta: bool,
|
||||
) -> Result<()> {
|
||||
self.put_page_entry(&tag, lsn, PageEntry::Page(img))?;
|
||||
if !rel.is_blocky() && blknum != 0 {
|
||||
bail!(
|
||||
"invalid request for block {} for non-blocky relish {}",
|
||||
blknum,
|
||||
rel
|
||||
);
|
||||
}
|
||||
|
||||
debug!("put_page_image rel {:?} at {}", tag, lsn);
|
||||
self.put_page_entry(&rel, blknum, lsn, PageEntry::Page(img))?;
|
||||
|
||||
debug!("put_page_image {} at {}", rel, lsn);
|
||||
|
||||
if !update_meta {
|
||||
return Ok(());
|
||||
}
|
||||
if let ObjectTag::RelationBuffer(tag) = tag {
|
||||
if rel.is_blocky() {
|
||||
// Also check if this created or extended the file
|
||||
let old_nblocks = self.relsize_get_nowait(tag.rel, lsn)?.unwrap_or(0);
|
||||
let old_nblocks = self.relsize_get_nowait(rel, lsn)?.unwrap_or(0);
|
||||
|
||||
if tag.blknum >= old_nblocks {
|
||||
let new_nblocks = tag.blknum + 1;
|
||||
if blknum >= old_nblocks {
|
||||
let new_nblocks = blknum + 1;
|
||||
|
||||
trace!(
|
||||
"Extended relation {} from {} to {} blocks at {}",
|
||||
tag.rel,
|
||||
"Extended {} from {} to {} blocks at {}",
|
||||
rel,
|
||||
old_nblocks,
|
||||
new_nblocks,
|
||||
lsn
|
||||
);
|
||||
|
||||
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
|
||||
self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
|
||||
let mut rel_meta = self.rel_meta.write().unwrap();
|
||||
rel_meta.insert(
|
||||
tag.rel,
|
||||
RelMetadata {
|
||||
rel,
|
||||
RelishMetadata {
|
||||
size: new_nblocks,
|
||||
last_updated: lsn,
|
||||
},
|
||||
@@ -523,14 +574,18 @@ impl Timeline for ObjectTimeline {
|
||||
/// Adds a relation-wide WAL record (like truncate) to the repository,
|
||||
/// associating it with all pages started with specified block number
|
||||
///
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
|
||||
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()> {
|
||||
if !rel.is_blocky() {
|
||||
bail!("invalid truncation for non-blocky relish {}", rel);
|
||||
}
|
||||
|
||||
info!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
|
||||
|
||||
self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(nblocks))?;
|
||||
let mut rel_meta = self.rel_meta.write().unwrap();
|
||||
rel_meta.insert(
|
||||
rel,
|
||||
RelMetadata {
|
||||
RelishMetadata {
|
||||
size: nblocks,
|
||||
last_updated: lsn,
|
||||
},
|
||||
@@ -631,173 +686,6 @@ impl Timeline for ObjectTimeline {
|
||||
Ok(Box::new(ObjectHistory { lsn, iter }))
|
||||
}
|
||||
|
||||
fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult> {
|
||||
let last_lsn = self.get_last_valid_lsn();
|
||||
let mut result: GcResult = Default::default();
|
||||
|
||||
// checked_sub() returns None on overflow.
|
||||
if let Some(horizon) = last_lsn.checked_sub(horizon) {
|
||||
// WAL is large enough to perform GC
|
||||
let now = Instant::now();
|
||||
let mut prepared_horizon = Lsn(u64::MAX);
|
||||
// Iterate through all objects in timeline
|
||||
for obj in self
|
||||
.obj_store
|
||||
.list_objects(self.timelineid, false, last_lsn)?
|
||||
{
|
||||
result.inspected += 1;
|
||||
match obj {
|
||||
// Prepared transactions
|
||||
ObjectTag::TwoPhase(prepare) => {
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
prepared_horizon = Lsn::min(lsn, prepared_horizon);
|
||||
if self.get_tx_status(prepare.xid, horizon)?
|
||||
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
{
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.prep_deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
ObjectTag::RelationMetadata(_) => {
|
||||
// Do not need to reconstruct page images,
|
||||
// just delete all old versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
let content = vers.1;
|
||||
match ObjectValue::des(&content[..])? {
|
||||
ObjectValue::RelationSize(RelationSizeEntry::Unlink) => {
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
result.dropped += 1;
|
||||
}
|
||||
_ => (), // preserve last version
|
||||
}
|
||||
last_version = false;
|
||||
result.truncated += 1;
|
||||
result.n_relations += 1;
|
||||
} else {
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
ObjectTag::RelationBuffer(tag) => {
|
||||
// Reconstruct page at horizon unless relation was dropped
|
||||
// and delete all older versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
result.truncated += 1;
|
||||
last_version = false;
|
||||
if let Some(rel_size) =
|
||||
self.relsize_get_nowait(tag.rel, last_lsn)?
|
||||
{
|
||||
if rel_size > tag.blknum {
|
||||
// preserve and materialize last version before deleting all preceding
|
||||
self.get_page_at_lsn_nowait(obj, lsn)?;
|
||||
continue;
|
||||
}
|
||||
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
|
||||
} else {
|
||||
if let Some(rel_size) =
|
||||
self.relsize_get_nowait(tag.rel, last_lsn)?
|
||||
{
|
||||
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
|
||||
continue;
|
||||
}
|
||||
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
|
||||
}
|
||||
// relation was dropped or truncated so this block can be removed
|
||||
}
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
}
|
||||
}
|
||||
// SLRU-s
|
||||
ObjectTag::Clog(_)
|
||||
| ObjectTag::MultiXactOffsets(_)
|
||||
| ObjectTag::MultiXactMembers(_) => {
|
||||
// Remove old versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self
|
||||
.obj_store
|
||||
.object_versions(&key, Lsn::min(prepared_horizon, horizon))?
|
||||
{
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
let content = vers.1;
|
||||
match ObjectValue::des(&content[..])? {
|
||||
ObjectValue::SLRUTruncate => {
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.slru_deleted += 1;
|
||||
}
|
||||
ObjectValue::Page(PageEntry::WALRecord(_)) => {
|
||||
// preserve and materialize last version before deleting all preceding
|
||||
self.get_page_at_lsn_nowait(obj, lsn)?;
|
||||
}
|
||||
_ => {} // do nothing if already materialized
|
||||
}
|
||||
last_version = false;
|
||||
} else {
|
||||
// delete deteriorated version
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.slru_deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// versioned always materialized objects: no need to reconstruct pages
|
||||
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
|
||||
// Remove old versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
// preserve last version
|
||||
last_version = false;
|
||||
} else {
|
||||
// delete deteriorated version
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.chkp_deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (), // do nothing
|
||||
}
|
||||
}
|
||||
result.elapsed = now.elapsed();
|
||||
info!("Garbage collection completed in {:?}: {} relations inspected, {} object inspected, {} version histories truncated, {} versions deleted, {} relations dropped",
|
||||
result.elapsed, result.n_relations, result.inspected, result.truncated, result.deleted, result.dropped);
|
||||
if compact {
|
||||
self.obj_store.compact();
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl ObjectTimeline {
|
||||
@@ -806,7 +694,7 @@ impl ObjectTimeline {
|
||||
///
|
||||
/// The caller must ensure that WAL has been received up to 'lsn'.
|
||||
///
|
||||
fn relsize_get_nowait(&self, rel: RelTag, lsn: Lsn) -> Result<Option<u32>> {
|
||||
fn relsize_get_nowait(&self, rel: RelishTag, lsn: Lsn) -> Result<Option<u32>> {
|
||||
{
|
||||
let rel_meta = self.rel_meta.read().unwrap();
|
||||
if let Some(meta) = rel_meta.get(&rel) {
|
||||
@@ -854,7 +742,8 @@ impl ObjectTimeline {
|
||||
///
|
||||
fn collect_records_for_apply(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<(Option<Bytes>, Vec<WALRecord>)> {
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
@@ -864,7 +753,7 @@ impl ObjectTimeline {
|
||||
// old page image.
|
||||
let searchkey = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag,
|
||||
tag: ObjectTag::Buffer(rel, blknum),
|
||||
};
|
||||
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
|
||||
while let Some((_key, value)) = iter.next().transpose()? {
|
||||
@@ -966,17 +855,17 @@ impl ObjectTimeline {
|
||||
//
|
||||
// Helper functions to store different kinds of objects to the underlying ObjectStore
|
||||
//
|
||||
fn put_page_entry(&self, tag: &ObjectTag, lsn: Lsn, val: PageEntry) -> Result<()> {
|
||||
fn put_page_entry(&self, tag: &RelishTag, blknum: u32, lsn: Lsn, val: PageEntry) -> Result<()> {
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: *tag,
|
||||
tag: ObjectTag::Buffer(*tag, blknum),
|
||||
};
|
||||
let val = ObjectValue::Page(val);
|
||||
|
||||
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
|
||||
}
|
||||
|
||||
fn put_relsize_entry(&self, tag: &RelTag, lsn: Lsn, val: RelationSizeEntry) -> Result<()> {
|
||||
fn put_relsize_entry(&self, tag: &RelishTag, lsn: Lsn, val: RelationSizeEntry) -> Result<()> {
|
||||
let key = relation_size_key(self.timelineid, *tag);
|
||||
let val = ObjectValue::RelationSize(val);
|
||||
|
||||
@@ -989,6 +878,137 @@ impl ObjectTimeline {
|
||||
|
||||
self.obj_store.put(&key, Lsn(0), &ObjectValue::ser(&val)?)
|
||||
}
|
||||
|
||||
fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult> {
|
||||
let last_lsn = self.get_last_valid_lsn();
|
||||
let mut result: GcResult = Default::default();
|
||||
|
||||
// checked_sub() returns None on overflow.
|
||||
if let Some(horizon) = last_lsn.checked_sub(horizon) {
|
||||
// WAL is large enough to perform GC
|
||||
let now = Instant::now();
|
||||
let mut prepared_horizon = Lsn(u64::MAX);
|
||||
// Iterate through all objects in timeline
|
||||
for obj in self.obj_store.list_objects(self.timelineid, last_lsn)? {
|
||||
result.inspected += 1;
|
||||
match obj {
|
||||
// Prepared transactions
|
||||
ObjectTag::Buffer(RelishTag::TwoPhase { xid }, _blknum) => {
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
prepared_horizon = Lsn::min(lsn, prepared_horizon);
|
||||
if self.get_tx_status(xid, horizon)?
|
||||
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
|
||||
{
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.prep_deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
ObjectTag::RelationMetadata(_) => {
|
||||
// Do not need to reconstruct page images,
|
||||
// just delete all old versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
let content = vers.1;
|
||||
match ObjectValue::des(&content[..])? {
|
||||
ObjectValue::RelationSize(RelationSizeEntry::Unlink) => {
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
result.dropped += 1;
|
||||
}
|
||||
_ => (), // preserve last version
|
||||
}
|
||||
last_version = false;
|
||||
result.truncated += 1;
|
||||
result.n_relations += 1;
|
||||
} else {
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
ObjectTag::Buffer(rel, blknum) => {
|
||||
if rel.is_blocky() {
|
||||
// Reconstruct page at horizon unless relation was dropped
|
||||
// and delete all older versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
result.truncated += 1;
|
||||
last_version = false;
|
||||
if let Some(rel_size) =
|
||||
self.relsize_get_nowait(rel, last_lsn)?
|
||||
{
|
||||
if rel_size > blknum {
|
||||
// preserve and materialize last version before deleting all preceding
|
||||
self.get_page_at_lsn_nowait(rel, blknum, lsn)?;
|
||||
continue;
|
||||
}
|
||||
debug!("Drop last block {} of relation {} at {} because it is beyond relation size {}", blknum, rel, lsn, rel_size);
|
||||
} else {
|
||||
if let Some(rel_size) =
|
||||
self.relsize_get_nowait(rel, last_lsn)?
|
||||
{
|
||||
debug!("Preserve block {} of relation {} at {} because relation has size {} at {}", blknum, rel, lsn, rel_size, last_lsn);
|
||||
continue;
|
||||
}
|
||||
debug!("Relation {} was dropped at {}", rel, lsn);
|
||||
}
|
||||
// relation was dropped or truncated so this block can be removed
|
||||
}
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.deleted += 1;
|
||||
}
|
||||
} else {
|
||||
// versioned always materialized objects: no need to reconstruct pages
|
||||
|
||||
// Remove old versions over horizon
|
||||
let mut last_version = true;
|
||||
let key = ObjectKey {
|
||||
timeline: self.timelineid,
|
||||
tag: obj,
|
||||
};
|
||||
for vers in self.obj_store.object_versions(&key, horizon)? {
|
||||
let lsn = vers.0;
|
||||
if last_version {
|
||||
// preserve last version
|
||||
last_version = false;
|
||||
} else {
|
||||
// delete deteriorated version
|
||||
self.obj_store.unlink(&key, lsn)?;
|
||||
result.chkp_deleted += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => (), // do nothing
|
||||
}
|
||||
}
|
||||
result.elapsed = now.elapsed();
|
||||
info!("Garbage collection completed in {:?}: {} relations inspected, {} object inspected, {} version histories truncated, {} versions deleted, {} relations dropped",
|
||||
result.elapsed, result.n_relations, result.inspected, result.truncated, result.deleted, result.dropped);
|
||||
if compact {
|
||||
self.obj_store.compact();
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
struct ObjectHistory<'a> {
|
||||
@@ -1060,7 +1080,7 @@ pub enum RelationSizeEntry {
|
||||
Unlink,
|
||||
}
|
||||
|
||||
const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
|
||||
const fn relation_size_key(timelineid: ZTimelineId, rel: RelishTag) -> ObjectKey {
|
||||
ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(rel),
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Low-level key-value storage abstraction.
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::repository::RelTag;
|
||||
use crate::relish::*;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use std::collections::HashSet;
|
||||
@@ -69,6 +69,12 @@ pub trait ObjectStore: Send + Sync {
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Iterate through non-rel relishes
|
||||
///
|
||||
/// This is used to prepare tarball for new node startup.
|
||||
/// Returns objects in increasing key-version order.
|
||||
fn list_nonrels<'a>(&'a self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Iterate through object tags. If nonrel_only, then only non-relational data is iterated.
|
||||
///
|
||||
/// This is used to implement GC and preparing tarball for new node startup
|
||||
@@ -76,7 +82,6 @@ pub trait ObjectStore: Send + Sync {
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timelineid: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
|
||||
|
||||
@@ -2,11 +2,12 @@
|
||||
//! page server.
|
||||
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{PageServerConf, ZTenantId};
|
||||
use crate::{PageServerConf, RepositoryFormat, ZTenantId};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::info;
|
||||
@@ -26,16 +27,39 @@ pub fn init(conf: &'static PageServerConf) {
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
|
||||
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo =
|
||||
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
|
||||
let repo: Arc<dyn Repository + Sync + Send> = match conf.repository_format {
|
||||
RepositoryFormat::Layered => {
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
if conf.gc_horizon != 0 {
|
||||
crate::layered_repository::LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
} else {
|
||||
info!("Garbage collection is disabled");
|
||||
}
|
||||
repo
|
||||
}
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = RocksObjectStore::open(conf, &tenantid).unwrap();
|
||||
|
||||
Arc::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
m.insert(tenantid, Arc::new(repo));
|
||||
m.insert(tenantid, repo);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,7 +76,7 @@ pub fn create_repository_for_tenant(
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
|
||||
m.insert(tenantid, Arc::new(repo));
|
||||
m.insert(tenantid, repo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -28,9 +28,9 @@ use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::branches;
|
||||
use crate::object_key::ObjectTag;
|
||||
use crate::page_cache;
|
||||
use crate::repository::{BufferTag, Modification, RelTag};
|
||||
use crate::relish::*;
|
||||
use crate::repository::Modification;
|
||||
use crate::walreceiver;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTenantId;
|
||||
@@ -206,12 +206,13 @@ impl PageServerHandler {
|
||||
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let tag = RelTag {
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
|
||||
let exist = timeline.get_rel_exists(tag, req.lsn).unwrap_or(false);
|
||||
|
||||
@@ -221,29 +222,28 @@ impl PageServerHandler {
|
||||
})
|
||||
}
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let tag = RelTag {
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
|
||||
let n_blocks = timeline.get_rel_size(tag, req.lsn).unwrap_or(0);
|
||||
|
||||
PagestreamBeMessage::Nblocks(PagestreamStatusResponse { ok: true, n_blocks })
|
||||
}
|
||||
PagestreamFeMessage::Read(req) => {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
},
|
||||
blknum: req.blkno,
|
||||
});
|
||||
let rel = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
|
||||
let read_response = match timeline.get_page_at_lsn(tag, req.lsn) {
|
||||
let read_response = match timeline.get_page_at_lsn(tag, req.blkno, req.lsn) {
|
||||
Ok(p) => PagestreamReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
@@ -431,11 +431,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let modification = Modification::des(&bytes)?;
|
||||
|
||||
last_lsn = modification.lsn;
|
||||
timeline.put_raw_data(
|
||||
modification.tag,
|
||||
last_lsn,
|
||||
&modification.data[..],
|
||||
)?;
|
||||
timeline.put_raw_data(modification.tag, modification.lsn, &modification.data)?;
|
||||
}
|
||||
FeMessage::CopyDone => {
|
||||
timeline.advance_last_valid_lsn(last_lsn);
|
||||
@@ -541,60 +537,31 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.map(|h| h.as_str().parse())
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let timeline =
|
||||
page_cache::get_repository_for_tenant(&tenantid)?.get_timeline(timelineid)?;
|
||||
let repo = page_cache::get_repository_for_tenant(&tenantid)?;
|
||||
|
||||
let result = timeline.gc_iteration(gc_horizon, true)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor {
|
||||
name: b"n_relations",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"truncated",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"deleted",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"prep_deleted",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"slru_deleted",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"chkp_deleted",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"dropped",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"elapsed",
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
..Default::default()
|
||||
},
|
||||
RowDescriptor::int8_col(b"n_relations"),
|
||||
RowDescriptor::int8_col(b"truncated"),
|
||||
RowDescriptor::int8_col(b"deleted"),
|
||||
RowDescriptor::int8_col(b"prep_deleted"),
|
||||
RowDescriptor::int8_col(b"slru_deleted"),
|
||||
RowDescriptor::int8_col(b"chkp_deleted"),
|
||||
RowDescriptor::int8_col(b"dropped"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_removed"),
|
||||
RowDescriptor::int8_col(b"snapshot_relfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_total"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_removed"),
|
||||
RowDescriptor::int8_col(b"snapshot_nonrelfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(&result.n_relations.to_string().as_bytes()),
|
||||
@@ -604,10 +571,48 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
Some(&result.slru_deleted.to_string().as_bytes()),
|
||||
Some(&result.chkp_deleted.to_string().as_bytes()),
|
||||
Some(&result.dropped.to_string().as_bytes()),
|
||||
Some(&result.snapshot_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
&result
|
||||
.snapshot_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
&result
|
||||
.snapshot_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(&result.snapshot_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(&result.snapshot_relfiles_removed.to_string().as_bytes()),
|
||||
Some(&result.snapshot_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(&result.snapshot_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
&result
|
||||
.snapshot_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
&result
|
||||
.snapshot_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
&result
|
||||
.snapshot_nonrelfiles_not_updated
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(&result.snapshot_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(&result.snapshot_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(&result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
error!("received unknown command from client");
|
||||
bail!("unknown command");
|
||||
}
|
||||
|
||||
|
||||
219
pageserver/src/relish.rs
Normal file
219
pageserver/src/relish.rs
Normal file
@@ -0,0 +1,219 @@
|
||||
//!
|
||||
//! Zenith stores PostgreSQL relations, and some other files, in the
|
||||
//! repository. The relations (i.e. tables and indexes) take up most
|
||||
//! of the space in a typical installation, while the other files are
|
||||
//! small. We call each relation and other file that is stored in the
|
||||
//! repository a "relish". It comes from "rel"-ish, as in "kind of a
|
||||
//! rel", because it covers relations as well as other things that are
|
||||
//! not relations, but are treated similarly for the purposes of the
|
||||
//! storage layer.
|
||||
//!
|
||||
//! This source file contains the definition of the RelishTag enum,
|
||||
//! which uniquely identifies a relish.
|
||||
//!
|
||||
//! Relishes come in two flavors: blocky and non-blocky. Relations and
|
||||
//! SLRUs are blocky, that is, they are divided into 8k blocks, and
|
||||
//! the repository tracks their size. Other relishes are non-blocky:
|
||||
//! the content of the whole relish is stored as one blob. Block
|
||||
//! number must be passed as 0 for all operations on a non-blocky
|
||||
//! relish. The one "block" that you store in a non-blocky relish can
|
||||
//! have arbitrary size, but they are expected to be small, or you
|
||||
//! will have performance issues.
|
||||
//!
|
||||
//! All relishes are versioned by LSN in the repository.
|
||||
//!
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
|
||||
///
|
||||
/// RelishTag identifies one relish.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum RelishTag {
|
||||
// Relations correspond to PostgreSQL relation forks. Each
|
||||
// PostgreSQL relation fork is considered a separate relish.
|
||||
Relation(RelTag),
|
||||
|
||||
// SLRUs include pg_clog, pg_multixact/members, and
|
||||
// pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
// they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
// or we do not support them in zenith yet (pg_commit_ts).
|
||||
//
|
||||
// These are currently never requested directly by the compute
|
||||
// nodes, although in principle that would be possible. However,
|
||||
// when a new compute node is created, these are included in the
|
||||
// tarball that we send to the compute node to initialize the
|
||||
// PostgreSQL data directory.
|
||||
//
|
||||
// Each SLRU segment in PostgreSQL is considered a separate
|
||||
// relish. For example, pg_clog/0000, pg_clog/0001, and so forth.
|
||||
//
|
||||
// SLRU segments are divided into blocks, like relations.
|
||||
Slru { slru: SlruKind, segno: u32 },
|
||||
|
||||
// Miscellaneous other files that need to be included in the
|
||||
// tarball at compute node creation. These are non-blocky, and are
|
||||
// expected to be small.
|
||||
|
||||
//
|
||||
// FileNodeMap represents PostgreSQL's 'pg_filenode.map'
|
||||
// files. They are needed to map catalog table OIDs to filenode
|
||||
// numbers. Usually the mapping is done by looking up a relation's
|
||||
// 'relfilenode' field in the 'pg_class' system table, but that
|
||||
// doesn't work for 'pg_class' itself and a few other such system
|
||||
// relations. See PostgreSQL relmapper.c for details.
|
||||
//
|
||||
// Each database has a map file for its local mapped catalogs,
|
||||
// and there is a separate map file for shared catalogs.
|
||||
//
|
||||
// These files are always 512 bytes long (although we don't check
|
||||
// or care about that in the page server).
|
||||
//
|
||||
FileNodeMap { spcnode: Oid, dbnode: Oid },
|
||||
|
||||
//
|
||||
// State files for prepared transactions (e.g pg_twophase/1234)
|
||||
//
|
||||
TwoPhase { xid: TransactionId },
|
||||
|
||||
// The control file, stored in global/pg_control
|
||||
ControlFile,
|
||||
|
||||
// Special entry that represents PostgreSQL checkpoint. It doesn't
|
||||
// correspond to any physical file in PostgreSQL, but we use it
|
||||
// to track fields needed to restore the checkpoint data in the
|
||||
// control file, when a compute node is created.
|
||||
Checkpoint,
|
||||
}
|
||||
|
||||
impl RelishTag {
|
||||
pub const fn is_blocky(&self) -> bool {
|
||||
match self {
|
||||
// These relishes work with blocks
|
||||
RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true,
|
||||
|
||||
// and these don't
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: _,
|
||||
dbnode: _,
|
||||
}
|
||||
| RelishTag::TwoPhase { xid: _ }
|
||||
| RelishTag::ControlFile
|
||||
| RelishTag::Checkpoint => false,
|
||||
}
|
||||
}
|
||||
|
||||
// convenience function to check if this relish is a normal relation.
|
||||
pub const fn is_relation(&self) -> bool {
|
||||
if let RelishTag::Relation(_) = self {
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
pub dbnode: Oid,
|
||||
pub relnode: Oid,
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Display RelishTag. Relations use the same format as RelTag, shown below; other relishes print a file-like name or a short description.
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelishTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
RelishTag::Relation(rel) => rel.fmt(f),
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
// e.g. pg_xact/0001
|
||||
write!(f, "{}/{:04X}", slru.to_str(), segno)
|
||||
}
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
write!(f, "relmapper file for spc {} db {}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => {
|
||||
write!(f, "pg_twophase/{:08X}", xid)
|
||||
}
|
||||
RelishTag::ControlFile => {
|
||||
write!(f, "control file")
|
||||
}
|
||||
RelishTag::Checkpoint => {
|
||||
write!(f, "checkpoint")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
|
||||
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
|
||||
/// hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
impl SlruKind {
|
||||
fn to_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Clog => "pg_xact",
|
||||
Self::MultiXactMembers => "pg_multixact/members",
|
||||
Self::MultiXactOffsets => "pg_multixact/offsets",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub const FIRST_NONREL_RELISH_TAG: RelishTag = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: 0,
|
||||
};
|
||||
@@ -1,15 +1,15 @@
|
||||
use crate::object_key::*;
|
||||
use crate::waldecoder::TransactionId;
|
||||
use crate::relish::*;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::TransactionId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::iter::Iterator;
|
||||
use std::ops::AddAssign;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -31,6 +31,20 @@ pub trait Repository: Send + Sync {
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration.
|
||||
/// garbage collection is periodically performed by gc thread,
|
||||
/// but it can be explicitly requested through page server api.
|
||||
///
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
/// `compact` parameter is used to force compaction of storage.
|
||||
/// some storage implementations are based on lsm tree and require periodic merge (compaction).
|
||||
/// usually storage implementation determines itself when compaction should be performed.
|
||||
/// but for gc tests it may be useful to force compaction just after completion of gc iteration
|
||||
/// to make sure that all detected garbage is removed.
|
||||
/// so right now `compact` is set to true when gc is explicitly requested through the page server api,
|
||||
/// and is set to false in gc threads, which repeat gc iterations in an infinite loop.
|
||||
fn gc_iteration(&self, timelineid: Option<ZTimelineId>, horizon: u64, compact: bool) -> Result<GcResult>;
|
||||
|
||||
// TODO get timelines?
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
|
||||
@@ -40,6 +54,8 @@ pub trait Repository: Send + Sync {
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct GcResult {
|
||||
// FIXME: These counters make sense for the ObjectRepository. They are not used
|
||||
// by the LayeredRepository.
|
||||
pub n_relations: u64,
|
||||
pub inspected: u64,
|
||||
pub truncated: u64,
|
||||
@@ -48,31 +64,73 @@ pub struct GcResult {
|
||||
pub slru_deleted: u64, // SLRU (clog, multixact)
|
||||
pub chkp_deleted: u64, // Checkpoints
|
||||
pub dropped: u64,
|
||||
|
||||
// These are used for the LayeredRepository instead
|
||||
pub snapshot_relfiles_total: u64,
|
||||
pub snapshot_relfiles_needed_by_cutoff: u64,
|
||||
pub snapshot_relfiles_needed_by_branches: u64,
|
||||
pub snapshot_relfiles_not_updated: u64,
|
||||
pub snapshot_relfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
|
||||
pub snapshot_relfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
|
||||
|
||||
pub snapshot_nonrelfiles_total: u64,
|
||||
pub snapshot_nonrelfiles_needed_by_cutoff: u64,
|
||||
pub snapshot_nonrelfiles_needed_by_branches: u64,
|
||||
pub snapshot_nonrelfiles_not_updated: u64,
|
||||
pub snapshot_nonrelfiles_removed: u64, // # of snapshot files removed because they have been made obsolete by newer snapshot files.
|
||||
pub snapshot_nonrelfiles_dropped: u64, // # of snapshot files removed because the relation was dropped
|
||||
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.n_relations += other.n_relations;
|
||||
self.truncated += other.truncated;
|
||||
self.deleted += other.deleted;
|
||||
self.dropped += other.dropped;
|
||||
|
||||
self.snapshot_relfiles_total += other.snapshot_relfiles_total;
|
||||
self.snapshot_relfiles_needed_by_cutoff += other.snapshot_relfiles_needed_by_cutoff;
|
||||
self.snapshot_relfiles_needed_by_branches += other.snapshot_relfiles_needed_by_branches;
|
||||
self.snapshot_relfiles_not_updated += other.snapshot_relfiles_not_updated;
|
||||
self.snapshot_relfiles_removed += other.snapshot_relfiles_removed;
|
||||
self.snapshot_relfiles_dropped += other.snapshot_relfiles_dropped;
|
||||
|
||||
self.snapshot_nonrelfiles_total += other.snapshot_nonrelfiles_total;
|
||||
self.snapshot_nonrelfiles_needed_by_cutoff += other.snapshot_nonrelfiles_needed_by_cutoff;
|
||||
self.snapshot_nonrelfiles_needed_by_branches +=
|
||||
other.snapshot_nonrelfiles_needed_by_branches;
|
||||
self.snapshot_nonrelfiles_not_updated += other.snapshot_nonrelfiles_not_updated;
|
||||
self.snapshot_nonrelfiles_removed += other.snapshot_nonrelfiles_removed;
|
||||
self.snapshot_nonrelfiles_dropped += other.snapshot_nonrelfiles_dropped;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Timeline: Send + Sync {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
fn get_page_at_lsn(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn_nowait(&self, tag: ObjectTag, lsn: Lsn) -> Result<Bytes>;
|
||||
fn get_page_at_lsn_nowait(&self, tag: RelishTag, blknum: u32, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
fn get_rel_size(&self, tag: RelishTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
/// Get a list of all distinct relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
|
||||
|
||||
/// Get a list of non-relational objects
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>>;
|
||||
fn list_nonrels<'a>(&'a self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
@@ -84,24 +142,27 @@ pub trait Timeline: Send + Sync {
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
|
||||
fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(
|
||||
&self,
|
||||
tag: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
img: Bytes,
|
||||
update_meta: bool,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Unlink relation. This method is used for marking dropped relations.
|
||||
fn put_unlink(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Put raw data
|
||||
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool)
|
||||
-> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Unlink relation. This method is used for marking dropped relations.
|
||||
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Truncate SLRU segment
|
||||
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
// Get object tag greater or equal than specified
|
||||
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
|
||||
|
||||
@@ -140,25 +201,20 @@ pub trait Timeline: Send + Sync {
|
||||
// TODO ordering guarantee?
|
||||
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
|
||||
|
||||
/// Perform one garbage collection iteration.
|
||||
/// Garbage collection is periodically performed by GC thread,
|
||||
/// but it can be explicitly requested through page server API.
|
||||
///
|
||||
/// `horizon` specifies delta from last LSN to preserve all object versions (PITR interval).
|
||||
/// `compact` parameter is used to force compaction of storage.
|
||||
/// Some storage implementations are based on LSM tree and require periodic merge (compaction).
|
||||
/// Usually storage implementation determines itself when compaction should be performed.
|
||||
/// But for GC tests it may be useful to force compaction just after completion of GC iteration
|
||||
/// to make sure that all detected garbage is removed.
|
||||
/// So right now `compact` is set to true when GC is explicitly requested through the page server API,
|
||||
/// and is set to false in GC threads, which repeat GC iterations in an infinite loop.
|
||||
fn gc_iteration(&self, horizon: u64, compact: bool) -> Result<GcResult>;
|
||||
|
||||
// Check transaction status
|
||||
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
|
||||
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let clog_page = self.get_page_at_lsn(tag, lsn)?;
|
||||
let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
let clog_page = self.get_page_at_lsn(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
)?;
|
||||
let status = transaction_id_get_status(xid, &clog_page[..]);
|
||||
Ok(status)
|
||||
}
|
||||
@@ -198,76 +254,6 @@ pub struct RepositoryStats {
|
||||
pub num_getpage_requests: Lsn,
|
||||
}
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
/// The derived `PartialOrd`/`Ord` compare fields in declaration order
/// (`forknum` first), so reordering the fields changes how tags sort.
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
/// A `RelTag` with every field (`forknum`, `spcnode`, `dbnode`, `relnode`) set to zero.
pub const ZEROED: Self = Self {
|
||||
forknum: 0,
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
};
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
/// This is used as a part of the key inside key-value storage (RocksDB currently).
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
/// The derived `PartialOrd`/`Ord` compare `rel` first and `blknum` second.
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BufferTag {
|
||||
/// The relation fork the page belongs to.
pub rel: RelTag,
|
||||
/// Block number of the page.
pub blknum: u32,
|
||||
}
|
||||
|
||||
impl BufferTag {
|
||||
/// A `BufferTag` made of `RelTag::ZEROED` and block number 0.
pub const ZEROED: Self = Self {
|
||||
rel: RelTag::ZEROED,
|
||||
blknum: 0,
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
@@ -308,11 +294,12 @@ impl WALRecord {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::object_repository::ObjectRepository;
|
||||
use crate::object_repository::{ObjectValue, PageEntry, RelationSizeEntry};
|
||||
use crate::rocksdb_storage::RocksObjectStore;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::{PageServerConf, ZTenantId};
|
||||
use crate::{PageServerConf, RepositoryFormat, ZTenantId};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
@@ -321,28 +308,18 @@ mod tests {
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelTag = RelTag {
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
const TESTREL_B: RelTag = RelTag {
|
||||
});
|
||||
const TESTREL_B: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1001,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
/// Convenience function to create a BufferTag for testing.
|
||||
/// Helps to keep the tests shorter.
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_BUF(blknum: u32) -> ObjectTag {
|
||||
ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_A,
|
||||
blknum,
|
||||
})
|
||||
}
|
||||
});
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
@@ -354,10 +331,16 @@ mod tests {
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
fn get_test_repo(
|
||||
test_name: &str,
|
||||
repository_format: RepositoryFormat,
|
||||
) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir).unwrap();
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
@@ -367,6 +350,7 @@ mod tests {
|
||||
superuser: "zenith_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
repository_format,
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
@@ -374,33 +358,54 @@ mod tests {
|
||||
let tenantid = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
||||
|
||||
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo =
|
||||
ObjectRepository::new(conf, Arc::new(obj_store), Arc::new(walredo_mgr), tenantid);
|
||||
let repo: Box<dyn Repository + Sync + Send> = match conf.repository_format {
|
||||
RepositoryFormat::Layered => {
|
||||
Box::new(LayeredRepository::new(conf, Arc::new(walredo_mgr), tenantid))
|
||||
}
|
||||
RepositoryFormat::RocksDb => {
|
||||
let obj_store = RocksObjectStore::create(conf, &tenantid)?;
|
||||
|
||||
Ok(Box::new(repo))
|
||||
Box::new(ObjectRepository::new(
|
||||
conf,
|
||||
Arc::new(obj_store),
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation.
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
fn test_relsize_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_relsize(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_relsize_layered", RepositoryFormat::Layered)?;
|
||||
test_relsize(&*repo)
|
||||
}
|
||||
|
||||
fn test_relsize(repo: &dyn Repository) -> Result<()> {
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 2, Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(5));
|
||||
|
||||
@@ -414,34 +419,34 @@ mod tests {
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(2))?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(3))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(4))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(5))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(5))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 2, Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
@@ -452,18 +457,18 @@ mod tests {
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(6))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 1, Lsn(6))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(5))?, 3);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 2, Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
@@ -476,18 +481,28 @@ mod tests {
|
||||
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
||||
/// any attention to Postgres segment boundaries there.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
fn test_large_rel_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_large_rel(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_large_rel_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel_layered", RepositoryFormat::Layered)?;
|
||||
test_large_rel(&*repo)
|
||||
}
|
||||
|
||||
fn test_large_rel(repo: &dyn Repository) -> Result<()> {
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
|
||||
let mut lsn = 0;
|
||||
for i in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
|
||||
let mut lsn = 1;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
lsn += 1;
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
|
||||
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img, true)?;
|
||||
}
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
|
||||
@@ -529,27 +544,37 @@ mod tests {
|
||||
}))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_branch_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_branch(&*repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_branch_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch_layered", RepositoryFormat::Layered)?;
|
||||
test_branch(&*repo)
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
fn test_branch(repo: &dyn Repository) -> Result<()> {
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(1), ZERO_PAGE.clone(), false)?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("foo blk 0 at 4"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(4), TEST_IMG("foo blk 0 at 4"), true)?;
|
||||
|
||||
// Create another relation
|
||||
let buftag2 = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: TESTREL_B,
|
||||
blknum: 0,
|
||||
});
|
||||
tline.put_page_image(buftag2, Lsn(2), TEST_IMG("foobar blk 0 at 2"), true)?;
|
||||
tline.put_page_image(TESTREL_B, 0, Lsn(2), TEST_IMG("foobar blk 0 at 2"), true)?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
@@ -558,22 +583,22 @@ mod tests {
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(3))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
newtline.put_page_image(TEST_BUF(0), Lsn(4), TEST_IMG("bar blk 0 at 4"), true)?;
|
||||
newtline.put_page_image(TESTREL_A, 0, Lsn(4), TEST_IMG("bar blk 0 at 4"), true)?;
|
||||
newtline.advance_last_valid_lsn(Lsn(4));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
tline.get_page_at_lsn(TESTREL_A, 0, Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(4))?,
|
||||
TEST_IMG("bar blk 0 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
newtline.get_page_at_lsn(buftag2, Lsn(4))?,
|
||||
newtline.get_page_at_lsn(TESTREL_B, 0, Lsn(4))?,
|
||||
TEST_IMG("foobar blk 0 at 2")
|
||||
);
|
||||
|
||||
@@ -583,8 +608,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_history() -> Result<()> {
|
||||
let repo = get_test_repo("test_snapshot")?;
|
||||
fn test_history_rocksdb() -> Result<()> {
|
||||
let repo = get_test_repo("test_history_rocksdb", RepositoryFormat::RocksDb)?;
|
||||
test_history(&*repo)
|
||||
}
|
||||
#[test]
|
||||
// TODO: This doesn't work with the layered storage, the functions needed for push/pull
|
||||
// functionality haven't been implemented yet.
|
||||
#[ignore]
|
||||
fn test_history_layered() -> Result<()> {
|
||||
let repo = get_test_repo("test_history_layered", RepositoryFormat::Layered)?;
|
||||
test_history(&*repo)
|
||||
}
|
||||
fn test_history(repo: &dyn Repository) -> Result<()> {
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
@@ -595,13 +631,11 @@ mod tests {
|
||||
|
||||
// add a page and advance the last valid LSN
|
||||
let rel = TESTREL_A;
|
||||
let tag = TEST_BUF(1);
|
||||
|
||||
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
|
||||
tline.put_page_image(rel, 1, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
|
||||
tline.advance_last_valid_lsn(Lsn(1));
|
||||
|
||||
let expected_page = Modification {
|
||||
tag,
|
||||
tag: ObjectTag::Buffer(rel, 1),
|
||||
lsn: Lsn(1),
|
||||
data: ObjectValue::ser(&ObjectValue::Page(PageEntry::Page(TEST_IMG(
|
||||
"blk 1 @ lsn 1",
|
||||
@@ -665,14 +699,16 @@ mod tests {
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {:?} to get to {}, with {} and {} records",
|
||||
tag,
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
|
||||
@@ -14,17 +14,20 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::waldecoder::*;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// `MAX_MULTIXACT_ID` divided by the number of multixact members per page.
///
/// NOTE(review): presumably the highest block number of the multixact
/// members SLRU -- confirm against the callers.
const MAX_MBR_BLKNO: u32 =
|
||||
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
|
||||
/// An 8192-byte buffer filled with zeros, backed by static data (no allocation).
const ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
///
|
||||
@@ -41,21 +44,21 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => {
|
||||
import_nonrel_file(timeline, lsn, ObjectTag::ControlFile, &direntry.path())?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::ControlFile, &direntry.path())?;
|
||||
// Extract checkpoint record from pg_control and store it as a separate object
|
||||
let pg_control_bytes =
|
||||
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, lsn)?;
|
||||
timeline.get_page_at_lsn_nowait(RelishTag::ControlFile, 0, lsn)?;
|
||||
let pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, lsn, checkpoint_bytes, false)?;
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes, false)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
}),
|
||||
},
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
@@ -92,10 +95,10 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::FileNodeMap(DatabaseTag {
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode: dboid,
|
||||
}),
|
||||
},
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
@@ -112,43 +115,24 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::Clog(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactMembers(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(
|
||||
timeline,
|
||||
lsn,
|
||||
|blknum| ObjectTag::MultiXactOffsets(SlruBufferTag { blknum }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(
|
||||
timeline,
|
||||
lsn,
|
||||
ObjectTag::TwoPhase(PrepareTag { xid }),
|
||||
&entry.path(),
|
||||
)?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
timeline.advance_last_valid_lsn(lsn);
|
||||
timeline.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
@@ -179,16 +163,14 @@ fn import_relfile(
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
});
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf), true)?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
@@ -210,10 +192,16 @@ fn import_relfile(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Import a "non-blocky" file into the repository
|
||||
///
|
||||
/// This is used for small files like the control file, twophase files etc. that
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
tag: ObjectTag,
|
||||
tag: RelishTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
@@ -221,31 +209,34 @@ fn import_nonrel_file(
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
|
||||
info!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn import_slru_file(
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
gen_tag: fn(blknum: u32) -> ObjectTag,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Does it look like a relation file?
|
||||
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
info!("importing slru file {}", path.display());
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
timeline.put_page_image(
|
||||
gen_tag(blknum),
|
||||
RelishTag::Slru { slru, segno },
|
||||
rpageno,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buf),
|
||||
false,
|
||||
true,
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -262,7 +253,9 @@ fn import_slru_file(
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
rpageno += 1;
|
||||
|
||||
// TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -277,7 +270,7 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint)?;
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
loop {
|
||||
@@ -341,11 +334,10 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
|
||||
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, last_lsn, checkpoint_bytes, false)?;
|
||||
|
||||
timeline.advance_last_valid_lsn(last_lsn);
|
||||
timeline.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -365,14 +357,11 @@ pub fn save_decoded_record(
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
let tag = RelishTag::Relation(RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
});
|
||||
|
||||
let rec = WALRecord {
|
||||
@@ -382,7 +371,7 @@ pub fn save_decoded_record(
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
timeline.put_wal_record(tag, blk.blkno, rec)?;
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
@@ -405,37 +394,25 @@ pub fn save_decoded_record(
|
||||
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let blknum = buf.get_u32_le();
|
||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
let rec = WALRecord {
|
||||
let pageno = buf.get_u32_le();
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_page_image(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
ZERO_PAGE,
|
||||
true,
|
||||
)?;
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
checkpoint.oldestXid = buf.get_u32_le();
|
||||
checkpoint.oldestXidDB = buf.get_u32_le();
|
||||
trace!(
|
||||
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
|
||||
blknum,
|
||||
checkpoint.oldestXid,
|
||||
checkpoint.oldestXidDB
|
||||
);
|
||||
if let Some(ObjectTag::Clog(first_slru_tag)) =
|
||||
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
|
||||
{
|
||||
for trunc_blknum in first_slru_tag.blknum..=blknum {
|
||||
let tag = ObjectTag::Clog(SlruBufferTag {
|
||||
blknum: trunc_blknum,
|
||||
});
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
}
|
||||
let xlrec = XlClogTruncate::decode(&mut buf);
|
||||
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
@@ -454,30 +431,44 @@ pub fn save_decoded_record(
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
ObjectTag::TwoPhase(PrepareTag {
|
||||
RelishTag::TwoPhase {
|
||||
xid: decoded.xl_xid,
|
||||
}),
|
||||
},
|
||||
0,
|
||||
rec,
|
||||
)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
let blknum = buf.get_u32_le();
|
||||
let rec = WALRecord {
|
||||
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
let pageno = buf.get_u32_le();
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_page_image(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
will_init: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
|
||||
} else {
|
||||
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
|
||||
};
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
ZERO_PAGE,
|
||||
true,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
|
||||
let pageno = buf.get_u32_le();
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_page_image(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
lsn,
|
||||
ZERO_PAGE,
|
||||
true,
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
@@ -541,7 +532,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
|
||||
let nblocks = timeline.get_rel_size(src_rel, req_lsn)?;
|
||||
let nblocks = timeline.get_rel_size(RelishTag::Relation(src_rel), req_lsn)?;
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
@@ -551,26 +542,18 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
|
||||
// Copy content
|
||||
for blknum in 0..nblocks {
|
||||
let src_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: src_rel,
|
||||
blknum,
|
||||
});
|
||||
let dst_key = ObjectTag::RelationBuffer(BufferTag {
|
||||
rel: dst_rel,
|
||||
blknum,
|
||||
});
|
||||
let content =
|
||||
timeline.get_page_at_lsn_nowait(RelishTag::Relation(src_rel), blknum, req_lsn)?;
|
||||
|
||||
let content = timeline.get_page_at_lsn_nowait(src_key, req_lsn)?;
|
||||
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
|
||||
|
||||
debug!("copying block {:?} to {:?}", src_key, dst_key);
|
||||
|
||||
timeline.put_page_image(dst_key, lsn, content, true)?;
|
||||
timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content, true)?;
|
||||
num_blocks_copied += 1;
|
||||
}
|
||||
|
||||
if nblocks == 0 {
|
||||
// make sure we have some trace of the relation, even if it's empty
|
||||
timeline.put_truncation(dst_rel, lsn, 0)?;
|
||||
timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?;
|
||||
}
|
||||
|
||||
num_rels_copied += 1;
|
||||
@@ -578,14 +561,14 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
|
||||
// Copy relfilemap
|
||||
for tag in timeline.list_nonrels(req_lsn)? {
|
||||
match tag {
|
||||
ObjectTag::FileNodeMap(db) => {
|
||||
if db.spcnode == src_tablespace_id && db.dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, req_lsn)?;
|
||||
let new_tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
if spcnode == src_tablespace_id && dbnode == src_db_id {
|
||||
let img = timeline.get_page_at_lsn_nowait(tag, 0, req_lsn)?;
|
||||
let new_tag = RelishTag::FileNodeMap {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
});
|
||||
timeline.put_page_image(new_tag, lsn, img, false)?;
|
||||
};
|
||||
timeline.put_page_image(new_tag, 0, lsn, img, false)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -614,7 +597,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
|
||||
relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
timeline.put_truncation(rel, lsn, rec.blkno)?;
|
||||
timeline.put_truncation(RelishTag::Relation(rel), lsn, rec.blkno)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
|
||||
let rel = RelTag {
|
||||
@@ -637,7 +620,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
|
||||
info!("Partial truncation of FSM is not supported");
|
||||
}
|
||||
let num_fsm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_fsm_blocks)?;
|
||||
timeline.put_truncation(RelishTag::Relation(rel), lsn, num_fsm_blocks)?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
|
||||
let rel = RelTag {
|
||||
@@ -656,7 +639,7 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
|
||||
info!("Partial truncation of VM is not supported");
|
||||
}
|
||||
let num_vm_blocks = 0;
|
||||
timeline.put_truncation(rel, lsn, num_vm_blocks)?;
|
||||
timeline.put_truncation(RelishTag::Relation(rel), lsn, num_vm_blocks)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -671,38 +654,94 @@ fn save_xact_record(
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
// Record update of CLOG page
|
||||
let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
timeline.put_wal_record(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
rec.clone(),
|
||||
)?;
|
||||
|
||||
for subxact in &parsed.subxacts {
|
||||
let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if subxact_blknum != blknum {
|
||||
blknum = subxact_blknum;
|
||||
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if subxact_pageno != pageno {
|
||||
pageno = subxact_pageno;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
rec.clone(),
|
||||
)?;
|
||||
}
|
||||
}
|
||||
for xnode in &parsed.xnodes {
|
||||
for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
|
||||
let rel_tag = RelTag {
|
||||
let rel = RelTag {
|
||||
forknum,
|
||||
spcnode: xnode.spcnode,
|
||||
dbnode: xnode.dbnode,
|
||||
relnode: xnode.relnode,
|
||||
};
|
||||
timeline.put_unlink(rel_tag, lsn)?;
|
||||
timeline.put_unlink(RelishTag::Relation(rel), lsn)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_clog_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
_timeline: &dyn Timeline,
|
||||
_lsn: Lsn,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
trace!(
|
||||
"RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}",
|
||||
xlrec.pageno,
|
||||
xlrec.oldest_xid,
|
||||
xlrec.oldest_xid_db
|
||||
);
|
||||
|
||||
checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
|
||||
// FIXME: Handle XID wraparound! I just commented this out,
|
||||
// because it was wrong in a dangerous way. But what this should
|
||||
// now do is identify the CLOG segments in the repository that are
|
||||
// older than the threshold in the WAL recor - taking XID
|
||||
// wraparound into account like the corresponding PostgreSQL code
|
||||
// does! - and call put_unlink() for the segments that are no
|
||||
// longer needed.
|
||||
|
||||
/*
|
||||
if let Some(ObjectTag::Clog(first_slru_tag)) =
|
||||
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
|
||||
{
|
||||
for trunc_blknum in first_slru_tag.blknum..=pageno {
|
||||
let tag = ObjectTag::Clog(SlruBufferTag {
|
||||
blknum: trunc_blknum,
|
||||
});
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
}
|
||||
*/
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
@@ -716,31 +755,47 @@ fn save_multixact_create_record(
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
rec.clone(),
|
||||
)?;
|
||||
|
||||
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno =
|
||||
let first_mbr_pageno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_pageno =
|
||||
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
|
||||
let mut blknum = first_mbr_blkno;
|
||||
let mut pageno = first_mbr_pageno;
|
||||
loop {
|
||||
// Update members page
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_wal_record(tag, rec.clone())?;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
},
|
||||
rpageno,
|
||||
rec.clone(),
|
||||
)?;
|
||||
|
||||
if blknum == last_mbr_blkno {
|
||||
if pageno == last_mbr_pageno {
|
||||
// last block inclusive
|
||||
break;
|
||||
}
|
||||
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
if pageno == MAX_MBR_BLKNO {
|
||||
pageno = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
pageno += 1;
|
||||
}
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
@@ -760,6 +815,18 @@ fn save_multixact_create_record(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Largest possible multixact member offset (all 32 bits set).
// The non-standard casing presumably mirrors the PostgreSQL constant of the
// same name, hence the lint exception.
#[allow(non_upper_case_globals)]
const MaxMultiXactOffset: u32 = u32::MAX;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
const fn MXOffsetToMemberPage(xid: u32) -> u32 {
|
||||
xid / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32
|
||||
}
|
||||
#[allow(non_snake_case)]
|
||||
const fn MXOffsetToMemberSegment(xid: u32) -> i32 {
|
||||
(MXOffsetToMemberPage(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
timeline: &dyn Timeline,
|
||||
@@ -768,31 +835,35 @@ fn save_multixact_truncate_record(
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = MXOffsetToMemberSegment(MaxMultiXactOffset);
|
||||
let startsegment: i32 = MXOffsetToMemberSegment(xlrec.start_trunc_memb);
|
||||
let endsegment: i32 = MXOffsetToMemberSegment(xlrec.end_trunc_memb);
|
||||
let mut segment: i32 = startsegment;
|
||||
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
for blknum in first_off_blkno..last_off_blkno {
|
||||
let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
}
|
||||
let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
// The members SLRU can, in contrast to the offsets one, be filled to almost
|
||||
// the full range at once. So we need to handle wraparound.
|
||||
let mut blknum = first_mbr_blkno;
|
||||
// Delete all the segments but the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
while blknum != last_mbr_blkno {
|
||||
let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
|
||||
timeline.put_slru_truncate(tag, lsn)?;
|
||||
// handle wraparound
|
||||
if blknum == MAX_MBR_BLKNO {
|
||||
blknum = 0;
|
||||
while segment != endsegment {
|
||||
timeline.put_unlink(
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: segment as u32,
|
||||
},
|
||||
lsn,
|
||||
)?;
|
||||
|
||||
/* move to next segment, handling wraparound correctly */
|
||||
if segment == maxsegment {
|
||||
segment = 0;
|
||||
} else {
|
||||
blknum += 1;
|
||||
segment += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Truncate offsets
|
||||
// FIXME: this did not handle wraparound correctly
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -808,10 +879,10 @@ fn save_relmap_record(
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
let tag = ObjectTag::FileNodeMap(DatabaseTag {
|
||||
let tag = RelishTag::FileNodeMap {
|
||||
spcnode: xlrec.tsid,
|
||||
dbnode: xlrec.dbid,
|
||||
});
|
||||
timeline.put_wal_record(tag, rec)?;
|
||||
};
|
||||
timeline.put_wal_record(tag, 0, rec)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//!
|
||||
use crate::object_key::*;
|
||||
use crate::object_store::ObjectStore;
|
||||
use crate::repository::RelTag;
|
||||
use crate::relish::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTenantId;
|
||||
use crate::ZTimelineId;
|
||||
@@ -144,10 +144,9 @@ impl ObjectStore for RocksObjectStore {
|
||||
fn list_objects<'a>(
|
||||
&'a self,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
|
||||
let iter = RocksObjectIter::new(&self.db, timeline, nonrel_only, lsn)?;
|
||||
let iter = RocksObjectIter::new(&self.db, timeline, lsn)?;
|
||||
Ok(Box::new(iter))
|
||||
}
|
||||
|
||||
@@ -179,7 +178,7 @@ impl ObjectStore for RocksObjectStore {
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::RelationMetadata(search_rel_tag),
|
||||
tag: ObjectTag::RelationMetadata(RelishTag::Relation(search_rel_tag)),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
@@ -189,7 +188,7 @@ impl ObjectStore for RocksObjectStore {
|
||||
}
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
|
||||
if let ObjectTag::RelationMetadata(RelishTag::Relation(rel_tag)) = key.obj_key.tag {
|
||||
if spcnode != 0 && rel_tag.spcnode != spcnode
|
||||
|| dbnode != 0 && rel_tag.dbnode != dbnode
|
||||
{
|
||||
@@ -212,6 +211,48 @@ impl ObjectStore for RocksObjectStore {
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Get a list of all distinct NON-relations in timeline
|
||||
///
|
||||
/// TODO: This implementation is very inefficient, it scans
|
||||
/// through all non-rel page versions in the system. In practice, this
|
||||
/// is used when initializing a new compute node, and the non-rel files
|
||||
/// are never very large nor change very frequently, so this will do for now.
|
||||
fn list_nonrels(&self, timelineid: ZTimelineId, lsn: Lsn) -> Result<HashSet<RelishTag>> {
|
||||
let mut rels: HashSet<RelishTag> = HashSet::new();
|
||||
|
||||
let search_key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline: timelineid,
|
||||
tag: ObjectTag::Buffer(FIRST_NONREL_RELISH_TAG, 0),
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(search_key.ser()?);
|
||||
while iter.valid() {
|
||||
let key = StorageKey::des(iter.key().unwrap())?;
|
||||
|
||||
if key.obj_key.timeline != timelineid {
|
||||
// reached end of this timeline in the store
|
||||
break;
|
||||
}
|
||||
|
||||
if let ObjectTag::Buffer(rel_tag, _blknum) = key.obj_key.tag {
|
||||
if key.lsn <= lsn {
|
||||
// visible in this snapshot
|
||||
rels.insert(rel_tag);
|
||||
}
|
||||
}
|
||||
// TODO: we could skip to next relation here like we do in list_rels(),
|
||||
// but hopefully there are not that many SLRU segments or other non-rel
|
||||
// entries for it to matter.
|
||||
iter.next();
|
||||
}
|
||||
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Iterate through versions of all objects in a timeline.
|
||||
///
|
||||
/// Returns objects in increasing key-version order.
|
||||
@@ -387,17 +428,11 @@ impl<'r> RocksObjects<'r> {
|
||||
struct RocksObjectIter<'a> {
|
||||
timeline: ZTimelineId,
|
||||
key: StorageKey,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
dbiter: rocksdb::DBRawIterator<'a>,
|
||||
}
|
||||
impl<'a> RocksObjectIter<'a> {
|
||||
fn new(
|
||||
db: &'a rocksdb::DB,
|
||||
timeline: ZTimelineId,
|
||||
nonrel_only: bool,
|
||||
lsn: Lsn,
|
||||
) -> Result<RocksObjectIter<'a>> {
|
||||
fn new(db: &'a rocksdb::DB, timeline: ZTimelineId, lsn: Lsn) -> Result<RocksObjectIter<'a>> {
|
||||
let key = StorageKey {
|
||||
obj_key: ObjectKey {
|
||||
timeline,
|
||||
@@ -409,7 +444,6 @@ impl<'a> RocksObjectIter<'a> {
|
||||
Ok(RocksObjectIter {
|
||||
key,
|
||||
timeline,
|
||||
nonrel_only,
|
||||
lsn,
|
||||
dbiter,
|
||||
})
|
||||
@@ -433,15 +467,7 @@ impl<'a> Iterator for RocksObjectIter<'a> {
|
||||
self.key.lsn = Lsn(u64::MAX); // next seek should skip all versions
|
||||
if key.lsn <= self.lsn {
|
||||
// visible in this snapshot
|
||||
if self.nonrel_only {
|
||||
match key.obj_key.tag {
|
||||
ObjectTag::RelationMetadata(_) => return None,
|
||||
ObjectTag::RelationBuffer(_) => return None,
|
||||
_ => return Some(key.obj_key.tag),
|
||||
}
|
||||
} else {
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
return Some(key.obj_key.tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,12 +9,11 @@ use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::XLogLongPageHeaderData;
|
||||
use postgres_ffi::XLogPageHeaderData;
|
||||
use postgres_ffi::XLogRecord;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type TransactionId = u32;
|
||||
pub type BlockNumber = u32;
|
||||
pub type OffsetNumber = u16;
|
||||
pub type MultiXactId = TransactionId;
|
||||
@@ -496,6 +495,24 @@ impl XlXactParsedRecord {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlClogTruncate {
|
||||
pub pageno: u32,
|
||||
pub oldest_xid: TransactionId,
|
||||
pub oldest_xid_db: Oid,
|
||||
}
|
||||
|
||||
impl XlClogTruncate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlClogTruncate {
|
||||
XlClogTruncate {
|
||||
pageno: buf.get_u32_le(),
|
||||
oldest_xid: buf.get_u32_le(),
|
||||
oldest_xid_db: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct MultiXactMember {
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
//!
|
||||
//! We keep one WAL receiver active per timeline.
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::page_cache;
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
@@ -171,7 +171,7 @@ fn walreceiver_main(
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint)?;
|
||||
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(RelishTag::Checkpoint, 0, startpoint)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
@@ -215,7 +215,8 @@ fn walreceiver_main(
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(
|
||||
ObjectTag::Checkpoint,
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
lsn,
|
||||
new_checkpoint_bytes,
|
||||
false,
|
||||
@@ -253,7 +254,7 @@ fn walreceiver_main(
|
||||
tenantid,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
if newest_segno - oldest_segno >= 1 {
|
||||
timeline.checkpoint()?;
|
||||
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -36,8 +37,7 @@ use tokio::time::timeout;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::object_key::*;
|
||||
use crate::repository::BufferTag;
|
||||
use crate::relish::*;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder::XlXactParsedRecord;
|
||||
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
|
||||
@@ -47,6 +47,19 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
/// This is used as a part of the key inside key-value storage (RocksDB currently).
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
///
|
||||
@@ -60,7 +73,8 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the records.
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
@@ -76,7 +90,8 @@ pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_tag: ObjectTag,
|
||||
_rel: RelishTag,
|
||||
_blknum: u32,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<WALRecord>,
|
||||
@@ -107,7 +122,8 @@ struct PostgresRedoManagerInternal {
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WalRedoRequest {
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
|
||||
base_img: Option<Bytes>,
|
||||
@@ -173,7 +189,8 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: ObjectTag,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
@@ -182,7 +199,8 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
|
||||
|
||||
let request = WalRedoRequest {
|
||||
tag,
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
@@ -274,7 +292,8 @@ impl PostgresRedoManagerInternal {
|
||||
process: &PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
let rel = request.rel;
|
||||
let blknum = request.blknum;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
@@ -284,11 +303,11 @@ impl PostgresRedoManagerInternal {
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if let ObjectTag::RelationBuffer(buf_tag) = tag {
|
||||
if let RelishTag::Relation(rel) = rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
} else {
|
||||
// Non-relational WAL records we apply ourselves.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
@@ -314,25 +333,24 @@ impl PostgresRedoManagerInternal {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
// The only operation we need to implement is CLOG_ZEROPAGE
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let tag_blknum = match tag {
|
||||
ObjectTag::Clog(slru) => slru.blknum,
|
||||
ObjectTag::TwoPhase(_) => {
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru != SlruKind::Clog {
|
||||
panic!("Not valid XACT relish tag {:?}", rel);
|
||||
}
|
||||
segno
|
||||
}
|
||||
RelishTag::TwoPhase { xid: _ } => {
|
||||
assert!(info == pg_constants::XLOG_XACT_PREPARE);
|
||||
trace!("Apply prepare {} record", xlogrec.xl_xid);
|
||||
page.clear();
|
||||
page.extend_from_slice(&buf[..]);
|
||||
continue;
|
||||
}
|
||||
_ => panic!("Not valid XACT object tag {:?}", tag),
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
@@ -345,9 +363,11 @@ impl PostgresRedoManagerInternal {
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
|
||||
@@ -364,9 +384,11 @@ impl PostgresRedoManagerInternal {
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if tag_blknum == blkno {
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
@@ -376,45 +398,53 @@ impl PostgresRedoManagerInternal {
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multiexact operations
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
// Just need to zero page
|
||||
page.copy_from_slice(&ZERO_PAGE);
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let ObjectTag::MultiXactMembers(slru) = tag {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let blkno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
if blkno == slru.blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &=
|
||||
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
|
||||
if let RelishTag::Slru {
|
||||
slru,
|
||||
segno: rec_segno,
|
||||
} = rel
|
||||
{
|
||||
if slru == SlruKind::MultiXactMembers {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let pageno =
|
||||
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if segno == rec_segno && rpageno == blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1
|
||||
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
|
||||
- 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid
|
||||
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
panic!();
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import pytest
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
import psycopg2.extras
|
||||
@@ -10,6 +11,7 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
# This test is pretty tightly coupled with the current implementation of page version storage
|
||||
# and garbage collection in object_repository.rs.
|
||||
#
|
||||
@pytest.mark.skip(reason="This test only works with the RocksDB implementation")
|
||||
def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
zenith_cli.run(["branch", "test_gc", "empty"])
|
||||
pg = postgres.create_start('test_gc')
|
||||
@@ -48,8 +50,8 @@ def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 3
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 4
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
print("Inserting two more rows and running GC")
|
||||
@@ -61,8 +63,8 @@ def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 2
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 4
|
||||
|
||||
# Insert one more row. It creates one more page version, but doesn't affect the
|
||||
# relation size.
|
||||
@@ -74,8 +76,8 @@ def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['deleted'] == 1
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 2
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
@@ -83,7 +85,7 @@ def test_gc(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
|
||||
assert row['n_relations'] == n_relations
|
||||
assert row['dropped'] == 0
|
||||
assert row['truncated'] == 30
|
||||
assert row['truncated'] == 31
|
||||
assert row['deleted'] == 0
|
||||
|
||||
#
|
||||
|
||||
122
test_runner/batch_others/test_snapfiles_gc.py
Normal file
122
test_runner/batch_others/test_snapfiles_gc.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
import time;
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def print_gc_result(row):
    """Print a three-line, human-readable summary of one GC result row.

    `row` is any mapping usable with `str.format_map`; it must provide
    `elapsed` plus the `snapshot_relfiles_*` and `snapshot_nonrelfiles_*`
    counters named in the templates below.
    """
    templates = (
        "GC duration {elapsed} ms",
        " REL total: {snapshot_relfiles_total}, needed_by_cutoff {snapshot_relfiles_needed_by_cutoff}, needed_by_branches: {snapshot_relfiles_needed_by_branches}, not_updated: {snapshot_relfiles_not_updated}, removed: {snapshot_relfiles_removed}, dropped: {snapshot_relfiles_dropped}",
        " NONREL total: {snapshot_nonrelfiles_total}, needed_by_cutoff {snapshot_nonrelfiles_needed_by_cutoff}, needed_by_branches: {snapshot_nonrelfiles_needed_by_branches}, not_updated: {snapshot_nonrelfiles_not_updated}, removed: {snapshot_nonrelfiles_removed}, dropped: {snapshot_nonrelfiles_dropped}",
    )
    # Emit the lines one at a time, in order, so a missing key still fails
    # only after the preceding lines have been printed.
    for template in templates:
        print(template.format_map(row))
|
||||
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old snapshot files
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of layered
|
||||
# storage, in layered_repository.rs.
|
||||
#
|
||||
def test_snapfiles_gc(zenith_cli, pageserver, postgres, pg_bin):
    """Run the pageserver's `do_gc` command after a series of changes to a
    small table and check the snapshot-file counters it reports.

    Steps: create a branch and a table, run a baseline GC, then insert rows,
    re-run GC, and finally drop the table, asserting on the
    `snapshot_relfiles_*` columns of the row returned by each `do_gc` call.
    """
    zenith_cli.run(["branch", "test_snapfiles_gc", "empty"])
    pg = postgres.create_start('test_snapfiles_gc')

    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            with closing(pageserver.connect()) as psconn:
                # DictCursor lets us address the GC result columns by name.
                with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:

                    # Get the timeline ID of our branch. We need it for the 'do_gc' command
                    cur.execute("SHOW zenith.zenith_timeline")
                    timeline = cur.fetchone()[0]

                    # Create a test table
                    cur.execute("CREATE TABLE foo(x integer)")

                    print("Looking up relfilenode of table foo")
                    cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
                    row = cur.fetchone()
                    print("relfilenode is {}".format(row[0]))

                    # Run GC, to clear out any garbage left behind in the catalogs by
                    # the CREATE TABLE command. We want to have a clean slate with no garbage
                    # before running the actual tests below, otherwise the counts won't match
                    # what we expect.
                    #
                    # Also run vacuum first to make it less likely that autovacuum or pruning
                    # kicks in and confuses our numbers.
                    cur.execute("VACUUM")

                    print("Running GC before test")
                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    # remember the number of files
                    snapshot_relfiles_remain = row['snapshot_relfiles_total'] - row['snapshot_relfiles_removed']
                    assert snapshot_relfiles_remain > 0

                    # Insert a row. The first insert will also create a metadata entry for the
                    # relation, with size == 1 block. Hence, bump up the expected relation count.
                    snapshot_relfiles_remain += 1
                    print("Inserting one row and running GC")
                    cur.execute("INSERT INTO foo VALUES (1)")
                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
                    assert row['snapshot_relfiles_removed'] == 0
                    assert row['snapshot_relfiles_dropped'] == 0

                    # Insert two more rows and run GC.
                    # This should create a new snapshot file with the new contents, and
                    # remove the old one.
                    print("Inserting two more rows and running GC")
                    cur.execute("INSERT INTO foo VALUES (2)")
                    cur.execute("INSERT INTO foo VALUES (3)")

                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    # 'total' counts the old file too, before it is removed: hence + 1.
                    assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
                    assert row['snapshot_relfiles_removed'] == 1
                    assert row['snapshot_relfiles_dropped'] == 0

                    # Do it again. Should again create a new snapshot file and remove old one.
                    print("Inserting two more rows and running GC")
                    cur.execute("INSERT INTO foo VALUES (2)")
                    cur.execute("INSERT INTO foo VALUES (3)")

                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain + 1
                    assert row['snapshot_relfiles_removed'] == 1
                    assert row['snapshot_relfiles_dropped'] == 0

                    # Run GC again, with no changes in the database. Should not remove anything.
                    print("Run GC again, with nothing to do")
                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)
                    assert row['snapshot_relfiles_total'] == snapshot_relfiles_remain
                    assert row['snapshot_relfiles_removed'] == 0
                    assert row['snapshot_relfiles_dropped'] == 0

                    #
                    # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
                    #
                    print("Drop table and run GC again")
                    cur.execute("DROP TABLE foo")

                    pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
                    row = pscur.fetchone()
                    print_gc_result(row)

                    # Each relation fork is counted separately, hence 3.
                    assert row['snapshot_relfiles_dropped'] == 3

                    # The catalog updates also create new snapshot files of the catalogs, which
                    # are counted as 'removed'
                    assert row['snapshot_relfiles_removed'] > 0

                    # TODO: perhaps we should count catalog and user relations separately,
                    # to make this kind of testing more robust
|
||||
@@ -231,10 +231,11 @@ impl PostgresBackend {
|
||||
}
|
||||
|
||||
Some(FeMessage::Query(m)) => {
|
||||
trace!("got query {:?}", m.body);
|
||||
info!("got query {:?}", m.body);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
if let Err(e) = handler.process_query(self, m.body) {
|
||||
let errmsg = format!("{}", e);
|
||||
error!("process_query errored: {}", errmsg);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
@@ -379,6 +379,21 @@ impl Default for RowDescriptor<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl RowDescriptor<'_> {
|
||||
/// Convenience function to create a RowDescriptor message for an int8 column
|
||||
pub const fn int8_col(name: &[u8]) -> RowDescriptor {
|
||||
RowDescriptor {
|
||||
name,
|
||||
tableoid: 0,
|
||||
attnum: 0,
|
||||
typoid: 20,
|
||||
typlen: 8,
|
||||
typmod: 0,
|
||||
formatcode: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct XLogDataBody<'a> {
|
||||
pub wal_start: u64,
|
||||
|
||||
Reference in New Issue
Block a user