Compare commits

..

16 Commits

Author SHA1 Message Date
Konstantin Knizhnik
600588034b Store page image with the same LSN as replaced WAL record 2021-07-09 12:06:46 +03:00
Konstantin Knizhnik
9f015bdc60 Include zenith.signal file in tarball 2021-06-21 16:01:10 +03:00
Konstantin Knizhnik
c564272142 [refer #258] Handle CHECKPOINT_ONLINE WAL record 2021-06-18 23:55:59 +03:00
Konstantin Knizhnik
14a0ae5456 Start compute node without generation of new WAL segment 2021-06-17 22:55:13 +03:00
Konstantin Knizhnik
0d9805a505 Handle non-relational data in PUSH command 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
87fce3fcd5 1. Always start replication from the beginning of WAL segment (to be able to skip missed segments)
2. Do not materialize always last version of objects in GC (only when needed)
3. Fix history test
4. Fix CPU consumption in wal_keeper when connection is broken
5. Fix handling of --recall parameter in walkeeper
2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
04e1ee5ce3 Correctly handle Unlink message in GC for SLRU 2021-06-17 22:49:44 +03:00
anastasia
ec675bbdd6 use correct req_lsn when gathering basebackup tar 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
a3b70745a9 Support collecting nonrel object in GC 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
41c352be7f Batch nextXid updates in object storage checkpoints 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
8a5fcf52c0 Remove special handling of shared catalogs 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
abb114decd Fix review issues:
- Add comments
- Handle members multixact wraparound
- Extract WAL segment creation (pg_resetwal) to postgres_ffi
- Fix adding prepared 2PC files to basebackup
2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
e7587ceb81 Make checkpoint and pg_control records in object storage also versioned 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
6d38b9ce6a Fix unit tests after merging 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
47ef9c7ef4 Fix various bugs caused by switch to new storage model 2021-06-17 22:49:44 +03:00
Konstantin Knizhnik
d73cb49f89 Support non-rel objects 2021-06-17 22:49:44 +03:00
43 changed files with 1505 additions and 1911 deletions

View File

@@ -37,7 +37,7 @@ jobs:
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison
fi
# Build postgres if the restore_cache didn't find a build.

View File

@@ -35,7 +35,7 @@ jobs:
- name: Install postgres dependencies
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison
- name: Set pg revision for caching
id: pg_ver

3
Cargo.lock generated
View File

@@ -1,7 +1,5 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.4.7"
@@ -2473,7 +2471,6 @@ dependencies = [
"bytes",
"hex-literal",
"log",
"postgres",
"serde",
"thiserror",
"workspace_hack",

View File

@@ -8,8 +8,3 @@ members = [
"zenith_utils",
"workspace_hack",
]
[profile.release]
# This is useful for profiling and, to some extent, debug.
# Besides, debug info should not affect the performance.
debug = true

View File

@@ -89,6 +89,9 @@ RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
ENV ZENITH_REPO_DIR /data/
ENV POSTGRES_DISTRIB_DIR /usr/local
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]

View File

@@ -1,12 +1,3 @@
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
SECCOMP = --with-libseccomp
SECCOMP =
else
SECCOMP =
endif
#
# Top level Makefile to build Zenith and PostgreSQL
#
@@ -30,12 +21,8 @@ tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
--enable-cassert \
--enable-debug \
--enable-depend \
$(SECCOMP) \
--prefix=$(abspath tmp_install) > configure.log)
../../vendor/postgres/configure CFLAGS='-O0 $(CFLAGS)' --enable-debug --enable-cassert \
--enable-depend --prefix=$(abspath tmp_install) > configure.log)
# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status

View File

@@ -8,7 +8,7 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison \
libssl-dev clang
```

View File

@@ -14,7 +14,6 @@ use std::{
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use zenith_utils::connstring::connection_host_port;
use crate::local_env::LocalEnv;
use pageserver::ZTimelineId;
@@ -290,15 +289,14 @@ impl PostgresNode {
// Connect it to the page server.
// Configure that node to take pages from pageserver
let (host, port) = connection_host_port(&self.pageserver.connection_config());
self.append_conf(
"postgresql.conf",
&format!(
"shared_preload_libraries = zenith \n\
zenith.page_server_connstring = 'host={} port={}'\n\
zenith.zenith_timeline='{}'\n",
host,
port,
self.pageserver.address().ip(),
self.pageserver.address().port(),
self.timelineid
),
)?;

View File

@@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::net::{TcpStream};
use std::net::{SocketAddr, TcpStream};
use std::path::PathBuf;
use std::process::Command;
use std::thread;
@@ -8,11 +8,10 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::{Config, NoTls};
use postgres::{Client, NoTls};
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use zenith_utils::connstring::connection_address;
use pageserver::branches::BranchInfo;
//
@@ -22,7 +21,7 @@ use pageserver::branches::BranchInfo;
//
pub struct PageServerNode {
pub kill_on_exit: bool,
pub connection_config: Option<Config>,
pub listen_address: Option<SocketAddr>,
pub env: LocalEnv,
}
@@ -30,19 +29,15 @@ impl PageServerNode {
pub fn from_env(env: &LocalEnv) -> PageServerNode {
PageServerNode {
kill_on_exit: false,
connection_config: None, // default
listen_address: None, // default
env: env.clone(),
}
}
fn default_config() -> Config {
"postgresql://no_user@localhost:64000/no_db".parse().unwrap()
}
pub fn connection_config(&self) -> Config {
match &self.connection_config {
Some(config) => config.clone(),
None => Self::default_config(),
pub fn address(&self) -> SocketAddr {
match self.listen_address {
Some(addr) => addr,
None => "127.0.0.1:64000".parse().unwrap(),
}
}
@@ -79,7 +74,7 @@ impl PageServerNode {
pub fn start(&self) -> Result<()> {
println!(
"Starting pageserver at '{}' in {}",
connection_address(&self.connection_config()),
self.address(),
self.repo_path().display()
);
@@ -121,29 +116,42 @@ impl PageServerNode {
}
// wait for pageserver stop
let address = connection_address(&self.connection_config());
for _ in 0..5 {
let stream = TcpStream::connect(&address);
let stream = TcpStream::connect(self.address());
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", address);
println!("Stopping pageserver on {}", self.address());
}
bail!("Failed to stop pageserver with pid {}", pid);
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.connection_config().connect(NoTls).unwrap();
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
println!("Pageserver query: '{}'", sql);
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
self.connection_config().connect(NoTls)
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
self.address().port(),
"no_db",
"no_user",
);
Client::connect(connstring.as_str(), NoTls)
}
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {

View File

@@ -2,10 +2,10 @@
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/timelines" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data --postgres-distrib /usr/local
pageserver --init --workdir $ZENITH_REPO_DIR
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -l 0.0.0.0:6400 -D /data
pageserver -l 0.0.0.0:6400 --workdir $ZENITH_REPO_DIR
else
"$@"
fi

View File

@@ -4,8 +4,8 @@
//! TODO: this module has nothing to do with PostgreSQL pg_basebackup.
//! It could use a better name.
//!
//! Stateless Postgres compute node is launched by sending tarball which contains non-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directory and
//! Stateless Postgres compute node is lauched by sending taball which contains on-relational data (multixacts, clog, filenodemaps, twophase files)
//! and generate pg_control and dummy segment of WAL. This module is responsible for creation of such tarball from snapshot directry and
//! data stored in object storage.
//!
use crate::ZTimelineId;
@@ -17,15 +17,14 @@ use std::time::SystemTime;
use tar::{Builder, Header};
use walkdir::WalkDir;
use crate::object_key::*;
use crate::repository::Timeline;
use crate::repository::{DatabaseTag, ObjectTag, Timeline};
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// This is shorliving object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between varyouds functions
/// used for constructing tarball.
pub struct Basebackup<'a> {
ar: Builder<&'a mut dyn Write>,
@@ -56,6 +55,7 @@ impl<'a> Basebackup<'a> {
}
}
#[rustfmt::skip] // otherwise "cargo fmt" produce very strange formatting for macch arms of self.timeline.list_nonrels
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
debug!("sending tarball of snapshot in {}", self.snappath);
for entry in WalkDir::new(&self.snappath) {
@@ -85,8 +85,7 @@ impl<'a> Basebackup<'a> {
trace!("sending {}", relpath.display());
self.ar.append_path_with_name(fullpath, relpath)?;
}
} else {
// relation pages are loaded on demand and should not be included in tarball
} else { // relation pages are loaded on demand and should not be included in tarball
trace!("not sending {}", relpath.display());
}
} else {
@@ -95,25 +94,26 @@ impl<'a> Basebackup<'a> {
}
// Generate non-relational files.
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
// Iteration is sorted order: all objects of the same time are grouped and traversed
// in key ascending order. For example all pg_xact records precede pg_multixact records and are sorted by block number.
// It allows to easily construct SLRU segments (32 blocks).
for obj in self.timeline.list_nonrels(self.lsn)? {
match obj {
ObjectTag::Clog(slru) => self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) => {
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?
}
ObjectTag::MultiXactOffsets(slru) => {
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?
}
ObjectTag::FileNodeMap(db) => self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) => self.add_twophase_file(&obj, prepare.xid)?,
ObjectTag::Clog(slru) =>
self.add_slru_segment("pg_xact", &obj, slru.blknum)?,
ObjectTag::MultiXactMembers(slru) =>
self.add_slru_segment("pg_multixact/members", &obj, slru.blknum)?,
ObjectTag::MultiXactOffsets(slru) =>
self.add_slru_segment("pg_multixact/offsets", &obj, slru.blknum)?,
ObjectTag::FileNodeMap(db) =>
self.add_relmap_file(&obj, &db)?,
ObjectTag::TwoPhase(prepare) =>
self.add_twophase_file(&obj, prepare.xid)?,
_ => {}
}
}
self.finish_slru_segment()?; // write last non-completed SLRU segment (if any)
self.add_pgcontrol_file()?;
self.add_pgcontrol_file()?;
self.ar.finish()?;
debug!("all tarred up!");
Ok(())
@@ -238,9 +238,8 @@ impl<'a> Basebackup<'a> {
info!("pg_control.state = {}", pg_control.state);
pg_control.state = pg_constants::DB_SHUTDOWNED;
// add zenith.signal file
self.ar
.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;
// add zenith.signal file
self.ar.append(&new_tar_header("zenith.signal", 0)?, &b""[..])?;
//send pg_control
let pg_control_bytes = pg_control.encode();

View File

@@ -8,7 +8,7 @@ use std::{
env,
fs::{File, OpenOptions},
io,
net::TcpListener,
net::{SocketAddr, TcpListener},
path::{Path, PathBuf},
process::exit,
thread,
@@ -65,10 +65,11 @@ impl CfgFileParams {
/// Create a PageServerConf from these string parameters
fn try_into_config(&self) -> Result<PageServerConf> {
let listen_addr = match self.listen_addr.as_ref() {
Some(addr) => addr.clone(),
None => DEFAULT_LISTEN_ADDR.to_owned(),
};
let listen_addr: SocketAddr = self
.listen_addr
.as_deref()
.unwrap_or(DEFAULT_LISTEN_ADDR)
.parse()?;
let gc_horizon: u64 = match self.gc_horizon.as_ref() {
Some(horizon_str) => horizon_str.parse()?,
@@ -271,7 +272,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
// Check that we can bind to address before further initialization
info!("Starting pageserver on {}", conf.listen_addr);
let pageserver_listener = TcpListener::bind(conf.listen_addr.clone())?;
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
// Initialize page cache, this will spawn walredo_thread
page_cache::init(conf);

View File

@@ -100,8 +100,7 @@ pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
// and we failed to run initdb again in the same directory. This has been solved for the
// rapid init+start case now, but the general race condition remains if you restart the
// server quickly.
//let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
let storage = crate::inmem_storage::InmemObjectStore::create(conf)?;
let storage = crate::rocksdb_storage::RocksObjectStore::create(conf)?;
let repo = crate::object_repository::ObjectRepository::new(
conf,

View File

@@ -1,349 +0,0 @@
//!
//! An implementation of the ObjectStore interface, backed by BTreeMap
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
use std::collections::{BTreeMap,HashSet};
use std::sync::RwLock;
use zenith_utils::lsn::Lsn;
use std::ops::Bound::*;
use serde::{Deserialize, Serialize};
use zenith_utils::bin_ser::BeSer;
use std::io::prelude::*;
use std::fs::File;
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Deserialize, Serialize)]
pub struct StorageKey {
obj_key: ObjectKey,
lsn: Lsn,
}
impl StorageKey {
/// The first key for a given timeline
fn timeline_start(timeline: ZTimelineId) -> Self {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
}
}
}
pub struct InmemObjectStore {
conf: &'static PageServerConf,
db: RwLock<BTreeMap<StorageKey, Vec<u8>>>,
}
impl ObjectStore for InmemObjectStore {
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>> {
let db = self.db.read().unwrap();
let val = db.get(&StorageKey {
obj_key: key.clone(),
lsn,
});
if let Some(val) = val {
Ok(val.clone())
} else {
bail!("could not find page {:?}", key);
}
}
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let db = self.db.read().unwrap();
for pair in db.range(&search_key..) {
let key = pair.0;
return Ok(Some(key.obj_key.clone()));
}
Ok(None)
}
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
let mut db = self.db.write().unwrap();
db.insert(
StorageKey {
obj_key: key.clone(),
lsn,
},
value.to_vec(),
);
Ok(())
}
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()> {
let mut db = self.db.write().unwrap();
db.remove(&StorageKey {
obj_key: key.clone(),
lsn,
});
Ok(())
}
/// Iterate through page versions of given page, starting from the given LSN.
/// The versions are walked in descending LSN order.
fn object_versions<'a>(
&'a self,
key: &ObjectKey,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>> {
let from = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
let till = StorageKey {
obj_key: key.clone(),
lsn,
};
let db = self.db.read().unwrap();
let versions: Vec<(Lsn, Vec<u8>)> = db.range(from..=till).map(|pair|(pair.0.lsn, pair.1.clone())).collect();
Ok(Box::new(InmemObjectVersionIter::new(versions)))
}
/// Iterate through all timeline objects
fn list_objects<'a>(
&'a self,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = ObjectTag> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);
Ok(Box::new(InmemObjectIter {
store: &self,
curr_key,
timeline,
nonrel_only,
lsn,
}))
}
/// Get a list of all distinct relations in given tablespace and database.
///
/// TODO: This implementation is very inefficient, it scans
/// through all entries in the given database. In practice, this
/// is used for CREATE DATABASE, and usually the template database is small.
/// But if it's not, this will be slow.
fn list_rels(
&self,
timelineid: ZTimelineId,
spcnode: u32,
dbnode: u32,
lsn: Lsn,
) -> Result<HashSet<RelTag>> {
// FIXME: This scans everything. Very slow
let mut rels: HashSet<RelTag> = HashSet::new();
let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
};
let db = self.db.read().unwrap();
'outer: loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(search_rel_tag),
},
lsn: Lsn(0),
};
for pair in db.range(&search_key..) {
let key = pair.0;
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
{
break 'outer;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
continue 'outer;
} else {
// no more relation metadata entries
break 'outer;
}
}
}
Ok(rels)
}
/// Iterate through versions of all objects in a timeline.
///
/// Returns objects in increasing key-version order.
/// Returns all versions up to and including the specified LSN.
fn objects<'a>(
&'a self,
timeline: ZTimelineId,
lsn: Lsn,
) -> Result<Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>> {
let curr_key = StorageKey::timeline_start(timeline);
Ok(Box::new(InmemObjects {
store: &self,
curr_key,
timeline,
lsn,
}))
}
fn compact(&self) {
}
}
impl Drop for InmemObjectStore {
fn drop(&mut self) {
let path = self.conf.workdir.join("objstore.dmp");
let mut f = File::create(path).unwrap();
f.write(&self.db.ser().unwrap()).unwrap();
}
}
impl InmemObjectStore {
pub fn open(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
let path = conf.workdir.join("objstore.dmp");
let mut f = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
f.read_to_end(&mut buffer)?;
let db = RwLock::new(BTreeMap::des(&buffer)?);
Ok(InmemObjectStore {
conf: conf,
db
})
}
pub fn create(conf: &'static PageServerConf) -> Result<InmemObjectStore> {
Ok(InmemObjectStore {
conf: conf,
db: RwLock::new(BTreeMap::new()),
})
}
}
///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order.
///
struct InmemObjectVersionIter {
versions: Vec<(Lsn, Vec<u8>)>,
curr: usize,
}
impl InmemObjectVersionIter {
fn new(versions: Vec<(Lsn, Vec<u8>)>) -> InmemObjectVersionIter {
let curr = versions.len();
InmemObjectVersionIter {
versions,
curr
}
}
}
impl Iterator for InmemObjectVersionIter {
type Item = (Lsn, Vec<u8>);
fn next(&mut self) -> std::option::Option<Self::Item> {
if self.curr == 0 {
None
} else {
self.curr -= 1;
Some(self.versions[self.curr].clone())
}
}
}
struct InmemObjects<'r> {
store: &'r InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
lsn: Lsn,
}
impl<'r> Iterator for InmemObjects<'r> {
// TODO consider returning Box<[u8]>
type Item = Result<(ObjectTag, Lsn, Vec<u8>)>;
fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
}
}
impl<'r> InmemObjects<'r> {
fn next_result(&mut self) -> Result<Option<(ObjectTag, Lsn, Vec<u8>)>> {
let db = self.store.db.read().unwrap();
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return Ok(None);
}
if key.lsn > self.lsn {
// TODO can speed up by seeking iterator
continue;
}
self.curr_key = key.clone();
let value = pair.1.clone();
return Ok(Some((key.obj_key.tag, key.lsn, value)));
}
Ok(None)
}
}
///
/// Iterator for `list_objects`. Returns all objects preceeding specified LSN
///
struct InmemObjectIter<'a> {
store: &'a InmemObjectStore,
curr_key: StorageKey,
timeline: ZTimelineId,
nonrel_only: bool,
lsn: Lsn,
}
impl<'a> Iterator for InmemObjectIter<'a> {
type Item = ObjectTag;
fn next(&mut self) -> std::option::Option<Self::Item> {
let db = self.store.db.read().unwrap();
'outer: loop {
for pair in db.range((Excluded(&self.curr_key),Unbounded)) {
let key = pair.0;
if key.obj_key.timeline != self.timeline {
return None;
}
self.curr_key = key.clone();
self.curr_key.lsn = Lsn(u64::MAX); // next seek should skip all versions
if key.lsn <= self.lsn {
// visible in this snapshot
if self.nonrel_only {
match key.obj_key.tag {
ObjectTag::RelationMetadata(_) => return None,
ObjectTag::RelationBuffer(_) => return None,
_ => return Some(key.obj_key.tag),
}
} else {
return Some(key.obj_key.tag);
}
}
continue 'outer;
}
return None;
}
}
}

View File

@@ -1,13 +1,13 @@
use serde::{Deserialize, Serialize};
use std::fmt;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
pub mod basebackup;
pub mod branches;
pub mod object_key;
pub mod object_repository;
pub mod object_store;
pub mod page_cache;
@@ -15,7 +15,6 @@ pub mod page_service;
pub mod repository;
pub mod restore_local_repo;
pub mod rocksdb_storage;
pub mod inmem_storage;
pub mod tui;
pub mod tui_event;
mod tui_logger;
@@ -27,7 +26,7 @@ pub mod walredo;
pub struct PageServerConf {
pub daemonize: bool,
pub interactive: bool,
pub listen_addr: String,
pub listen_addr: SocketAddr,
pub gc_horizon: u64,
pub gc_period: Duration,
@@ -104,7 +103,7 @@ impl PageServerConf {
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ZTimelineId([u8; 16]);
impl FromStr for ZTimelineId {

View File

@@ -1,84 +0,0 @@
use crate::repository::{BufferTag, RelTag};
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use serde::{Deserialize, Serialize};
///
/// ObjectKey is the key type used to identify objects stored in an object
/// repository. It is shared between object_repository.rs and object_store.rs.
/// It is mostly opaque to ObjectStore, it just stores and retrieves objects
/// using the key given by the caller.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}
///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
///
/// These files are global for a postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
pub blknum: u32,
}
///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have untypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
pub spcnode: u32,
pub dbnode: u32,
}
///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
pub xid: TransactionId,
}
/// ObjectTag is a part of ObjectKey that is specific to the type of
/// the stored object.
///
/// NB: the order of the enum values is significant! In particular,
/// rocksdb_storage.rs assumes that TimelineMetadataTag is first
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
// dummy tag preceeding all other keys
FirstTag,
TimelineMetadataTag,
// Special entry that represents PostgreSQL checkpoint.
// We use it to track fields needed to restore controlfile checkpoint.
Checkpoint,
// Various types of non-relation files.
// We need them to bootstrap compute node.
ControlFile,
Clog(SlruBufferTag),
MultiXactMembers(SlruBufferTag),
MultiXactOffsets(SlruBufferTag),
FileNodeMap(DatabaseTag),
TwoPhase(PrepareTag),
// put relations at the end of enum to allow efficient iterations through non-rel objects
RelationMetadata(RelTag),
RelationBuffer(BufferTag),
}

View File

@@ -13,8 +13,7 @@
//! until we find the page we're looking for, making a separate lookup into the
//! key-value store for each timeline.
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::*;
use crate::restore_local_repo::import_timeline_wal;
use crate::walredo::WalRedoManager;
@@ -106,7 +105,7 @@ impl Repository for ObjectRepository {
) -> Result<Arc<dyn Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
// Write initial metadata key.
// Write metadata key
let metadata = MetadataEntry {
last_valid_lsn: start_lsn,
last_record_lsn: start_lsn,
@@ -162,7 +161,7 @@ impl Repository for ObjectRepository {
ObjectTag::TimelineMetadataTag => {} // skip it
_ => {
let img = src_timeline.get_page_at_lsn_nowait(tag, at_lsn)?;
let val = ObjectValue::Page(PageEntry::Page(img));
let val = ObjectValue::Page(img);
let key = ObjectKey { timeline: dst, tag };
self.obj_store.put(&key, at_lsn, &ObjectValue::ser(&val)?)?;
}
@@ -236,19 +235,22 @@ impl ObjectTimeline {
let v = obj_store
.get(&timeline_metadata_key(timelineid), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;
let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
let timeline = ObjectTimeline {
timelineid,
obj_store,
walredo_mgr,
last_valid_lsn: SeqWait::new(metadata.last_valid_lsn),
last_record_lsn: AtomicLsn::new(metadata.last_record_lsn.0),
ancestor_timeline: metadata.ancestor_timeline,
ancestor_lsn: metadata.ancestor_lsn,
rel_meta: RwLock::new(BTreeMap::new()),
};
Ok(timeline)
} else {
bail!("Invalid timeline metadata");
}
}
}
@@ -278,22 +280,17 @@ impl Timeline for ObjectTimeline {
let page_img: Bytes;
match ObjectValue::des(&value)? {
ObjectValue::Page(PageEntry::Page(img)) => {
ObjectValue::Page(img) => {
page_img = img;
}
ObjectValue::Page(PageEntry::WALRecord(_rec)) => {
ObjectValue::WALRecord(_rec) => {
// Request the WAL redo manager to apply the WAL records for us.
let (base_img, records) = self.collect_records_for_apply(tag, lsn)?;
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
// Garbage collection assumes that we remember the materialized page
// version. Otherwise we could opt to not do it, with the downside that
// the next GetPage@LSN call of the same page version would have to
// redo the WAL again.
self.put_page_image(tag, lsn, page_img.clone(), false)?;
self.put_page_image(tag, lsn, page_img.clone())?;
}
ObjectValue::SLRUTruncate => page_img = Bytes::from_static(&ZERO_PAGE),
_ => bail!("Invalid object kind, expected a page entry or SRLU truncate"),
x => bail!("Unexpected object value: {:?}", x),
}
// FIXME: assumes little-endian. Only used for the debugging log though
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
@@ -373,10 +370,12 @@ impl Timeline for ObjectTimeline {
.obj_store
.get(&timeline_metadata_key(timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let metadata = ObjectValue::des_timeline_metadata(&v)?;
prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
if let ObjectValue::TimelineMetadata(metadata) = ObjectValue::des(&v)? {
prev_timeline = metadata.ancestor_timeline;
lsn = metadata.ancestor_lsn;
} else {
bail!("Invalid timeline metadata");
}
}
Ok(all_rels)
@@ -394,8 +393,13 @@ impl Timeline for ObjectTimeline {
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()> {
let lsn = rec.lsn;
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::WALRecord(rec);
self.put_page_entry(&tag, lsn, PageEntry::WALRecord(rec))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
debug!("put_wal_record {:?} at {}", tag, lsn);
if let ObjectTag::RelationBuffer(tag) = tag {
@@ -404,6 +408,8 @@ impl Timeline for ObjectTimeline {
if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;
let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);
trace!(
"Extended relation {} from {} to {} blocks at {}",
@@ -413,7 +419,7 @@ impl Timeline for ObjectTimeline {
lsn
);
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -427,36 +433,18 @@ impl Timeline for ObjectTimeline {
Ok(())
}
/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, rel_tag: RelTag, lsn: Lsn) -> Result<()> {
self.put_relsize_entry(&rel_tag, lsn, RelationSizeEntry::Unlink)?;
Ok(())
}
/// Truncate SRLU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::SLRUTruncate;
let val = ObjectValue::Unlink;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
Ok(())
}
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
if let Some(key) = self.obj_store.get_next_key(&key)? {
Ok(Some(key.tag))
} else {
Ok(None)
}
}
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
@@ -469,13 +457,16 @@ impl Timeline for ObjectTimeline {
///
/// Memorize a full image of a page version
///
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()> {
self.put_page_entry(&tag, lsn, PageEntry::Page(img))?;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag,
};
let val = ObjectValue::Page(img);
if !update_meta {
return Ok(());
}
debug!("put_page_image rel {:?} at {}", tag, lsn);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
debug!("put_page_image {:?} at {}", tag, lsn);
if let ObjectTag::RelationBuffer(tag) = tag {
// Also check if this created or extended the file
@@ -483,7 +474,8 @@ impl Timeline for ObjectTimeline {
if tag.blknum >= old_nblocks {
let new_nblocks = tag.blknum + 1;
let key = relation_size_key(self.timelineid, tag.rel);
let val = ObjectValue::RelationSize(new_nblocks);
trace!(
"Extended relation {} from {} to {} blocks at {}",
tag.rel,
@@ -492,7 +484,7 @@ impl Timeline for ObjectTimeline {
lsn
);
self.put_relsize_entry(&tag.rel, lsn, RelationSizeEntry::Size(new_nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
tag.rel,
@@ -511,9 +503,10 @@ impl Timeline for ObjectTimeline {
/// associating it with all pages started with specified block number
///
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
info!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
let key = relation_size_key(self.timelineid, rel);
let val = ObjectValue::RelationSize(nblocks);
self.put_relsize_entry(&rel, lsn, RelationSizeEntry::Size(nblocks))?;
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)?;
let mut rel_meta = self.rel_meta.write().unwrap();
rel_meta.insert(
rel,
@@ -598,7 +591,12 @@ impl Timeline for ObjectTimeline {
};
trace!("checkpoint at {}", metadata.last_valid_lsn);
self.put_timeline_metadata_entry(metadata)?;
let val = ObjectValue::TimelineMetadata(metadata);
self.obj_store.put(
&timeline_metadata_key(self.timelineid),
Lsn(0),
&ObjectValue::ser(&val)?,
)?;
Ok(())
}
@@ -606,171 +604,7 @@ impl Timeline for ObjectTimeline {
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>> {
let lsn = self.last_valid_lsn.load();
let iter = self.obj_store.objects(self.timelineid, lsn)?;
Ok(Box::new(ObjectHistory {
lsn,
iter,
last_relation_size: None,
}))
}
fn gc_iteration(&self, horizon: u64) -> Result<GcResult> {
let last_lsn = self.get_last_valid_lsn();
let mut result: GcResult = Default::default();
// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(horizon) {
// WAL is large enough to perform GC
let now = Instant::now();
// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
result.inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
result.prep_deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::RelationSize(RelationSizeEntry::Unlink) => {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
result.dropped += 1;
}
_ => (), // preserve last version
}
last_version = false;
result.truncated += 1;
result.n_relations += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
result.truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, last_lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
result.deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::SLRUTruncate => {
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
ObjectValue::Page(PageEntry::WALRecord(_)) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.slru_deleted += 1;
}
}
}
// versioned always materialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// preserve last version
last_version = false;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
result.chkp_deleted += 1;
}
}
}
_ => (), // do nothing
}
}
result.elapsed = now.elapsed();
info!("Garbage collection completed in {:?}: {} relations inspected, {} object inspected, {} version histories truncated, {} versions deleted, {} relations dropped",
result.elapsed, result.n_relations, result.inspected, result.truncated, result.deleted, result.dropped);
self.obj_store.compact();
}
Ok(result)
Ok(Box::new(ObjectHistory { lsn, iter }))
}
}
@@ -793,8 +627,9 @@ impl ObjectTimeline {
let mut iter = self.object_versions(&*self.obj_store, &key, lsn)?;
if let Some((version_lsn, value)) = iter.next().transpose()? {
match ObjectValue::des_relsize(&value)? {
RelationSizeEntry::Size(nblocks) => {
let value = ObjectValue::des(&value)?;
match value {
ObjectValue::RelationSize(nblocks) => {
trace!(
"relation {} has size {} at {} (request {})",
rel,
@@ -804,7 +639,7 @@ impl ObjectTimeline {
);
Ok(Some(nblocks))
}
RelationSizeEntry::Unlink => {
ObjectValue::Unlink => {
trace!(
"relation {} not found; it was dropped at lsn {}",
rel,
@@ -812,9 +647,15 @@ impl ObjectTimeline {
);
Ok(None)
}
_ => bail!(
"Unexpect relation {} size value {:?} at {}",
rel,
value,
lsn
),
}
} else {
info!("relation {} not found at {}", rel, lsn);
debug!("relation {} not found at {}", rel, lsn);
Ok(None)
}
}
@@ -842,20 +683,21 @@ impl ObjectTimeline {
};
let mut iter = self.object_versions(&*self.obj_store, &searchkey, lsn)?;
while let Some((_key, value)) = iter.next().transpose()? {
match ObjectValue::des_page(&value)? {
PageEntry::Page(img) => {
match ObjectValue::des(&value)? {
ObjectValue::Page(img) => {
// We have a base image. No need to dig deeper into the list of
// records
base_img = Some(img);
break;
}
PageEntry::WALRecord(rec) => {
ObjectValue::WALRecord(rec) => {
records.push(rec.clone());
// If this WAL record initializes the page, no need to dig deeper.
if rec.will_init {
break;
}
}
x => bail!("Unexpected object value {:?}", x),
}
}
records.reverse();
@@ -867,15 +709,170 @@ impl ObjectTimeline {
.name("Garbage collection thread".into())
.spawn(move || {
// FIXME
timeline_rc.gc_loop(conf).expect("GC thread died");
timeline_rc.do_gc(conf).expect("GC thread died");
})
.unwrap();
}
fn gc_loop(&self, conf: &'static PageServerConf) -> Result<()> {
fn do_gc(&self, conf: &'static PageServerConf) -> Result<()> {
loop {
thread::sleep(conf.gc_period);
self.gc_iteration(conf.gc_horizon)?;
let last_lsn = self.get_last_valid_lsn();
// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
// WAL is large enough to perform GC
let now = Instant::now();
let mut truncated = 0u64;
let mut inspected = 0u64;
let mut deleted = 0u64;
// Iterate through all objects in timeline
for obj in self
.obj_store
.list_objects(self.timelineid, false, last_lsn)?
{
inspected += 1;
match obj {
// Prepared transactions
ObjectTag::TwoPhase(prepare) => {
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
if self.get_tx_status(prepare.xid, horizon)?
!= pg_constants::TRANSACTION_STATUS_IN_PROGRESS
{
let lsn = vers.0;
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationMetadata(_) => {
// Do not need to reconstruct page images,
// just delete all old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
_ => (), // preserve last version
}
last_version = false;
truncated += 1;
} else {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
ObjectTag::RelationBuffer(tag) => {
// Reconstruct page at horizon unless relation was dropped
// and delete all older versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
truncated += 1;
last_version = false;
if let Some(rel_size) = self.relsize_get_nowait(tag.rel, lsn)? {
if rel_size > tag.blknum {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
continue;
}
debug!("Drop last block {} of relation {:?} at {} because it is beyond relation size {}", tag.blknum, tag.rel, lsn, rel_size);
} else {
if let Some(rel_size) =
self.relsize_get_nowait(tag.rel, last_lsn)?
{
debug!("Preserve block {} of relation {:?} at {} because relation has size {} at {}", tag.rel, tag, lsn, rel_size, last_lsn);
continue;
}
debug!("Relation {:?} was dropped at {}", tag.rel, lsn);
}
// relation was dropped or truncated so this block can be removed
}
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
// SLRU-s
ObjectTag::Clog(_)
| ObjectTag::MultiXactOffsets(_)
| ObjectTag::MultiXactMembers(_) => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
let content = vers.1;
match ObjectValue::des(&content[..])? {
ObjectValue::Unlink => {
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
ObjectValue::WALRecord(_) => {
// preserve and materialize last version before deleting all preceeding
self.get_page_at_lsn_nowait(obj, lsn)?;
}
_ => {} // do nothing if already materialized
}
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
// versioned alwaysmaterialized objects: no need to reconstruct pages
ObjectTag::Checkpoint | ObjectTag::ControlFile => {
// Remove old versions over horizon
let mut last_version = true;
let key = ObjectKey {
timeline: self.timelineid,
tag: obj,
};
for vers in self.obj_store.object_versions(&key, horizon)? {
let lsn = vers.0;
if last_version {
// presrve last version
last_version = false;
truncated += 1;
} else {
// delete deteriorated version
self.obj_store.unlink(&key, lsn)?;
deleted += 1;
}
}
}
_ => (), // do nothing
}
}
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} version histories truncated, {} versions deleted",
now.elapsed(), inspected, truncated, deleted);
}
}
}
@@ -930,52 +927,26 @@ impl ObjectTimeline {
Ok(ObjectVersionIter {
obj_store,
object_tag: key.tag,
tag: key.tag,
current_iter,
ancestor_timeline: self.ancestor_timeline,
ancestor_lsn: self.ancestor_lsn,
})
}
//
// Helper functions to store different kinds of objects to the underlying ObjectStore
//
fn put_page_entry(&self, tag: &ObjectTag, lsn: Lsn, val: PageEntry) -> Result<()> {
let key = ObjectKey {
timeline: self.timelineid,
tag: *tag,
};
let val = ObjectValue::Page(val);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}
fn put_relsize_entry(&self, tag: &RelTag, lsn: Lsn, val: RelationSizeEntry) -> Result<()> {
let key = relation_size_key(self.timelineid, *tag);
let val = ObjectValue::RelationSize(val);
self.obj_store.put(&key, lsn, &ObjectValue::ser(&val)?)
}
fn put_timeline_metadata_entry(&self, val: MetadataEntry) -> Result<()> {
let key = timeline_metadata_key(self.timelineid);
let val = ObjectValue::TimelineMetadata(val);
self.obj_store.put(&key, Lsn(0), &ObjectValue::ser(&val)?)
}
}
struct ObjectHistory<'a> {
iter: Box<dyn Iterator<Item = Result<(ObjectTag, Lsn, Vec<u8>)>> + 'a>,
lsn: Lsn,
last_relation_size: Option<(RelTag, u32)>,
}
impl<'a> Iterator for ObjectHistory<'a> {
type Item = Result<RelationUpdate>;
type Item = Result<Modification>;
fn next(&mut self) -> Option<Self::Item> {
self.next_result().transpose()
self.iter
.next()
.map(|result| result.map(|t| Modification::new(t)))
}
}
@@ -985,117 +956,29 @@ impl<'a> History for ObjectHistory<'a> {
}
}
impl<'a> ObjectHistory<'a> {
fn handle_relation_size(
&mut self,
rel_tag: RelTag,
entry: RelationSizeEntry,
) -> Option<Update> {
match entry {
RelationSizeEntry::Size(size) => {
// we only want to output truncations, expansions are filtered out
let last_relation_size = self.last_relation_size.replace((rel_tag, size));
match last_relation_size {
Some((last_buf, last_size)) if last_buf != rel_tag || size < last_size => {
Some(Update::Truncate { n_blocks: size })
}
_ => None,
}
}
RelationSizeEntry::Unlink => Some(Update::Unlink),
}
}
fn handle_page(&mut self, buf_tag: BufferTag, entry: PageEntry) -> Update {
match entry {
PageEntry::Page(img) => Update::Page {
blknum: buf_tag.blknum,
img,
},
PageEntry::WALRecord(rec) => Update::WALRecord {
blknum: buf_tag.blknum,
rec,
},
}
}
fn next_result(&mut self) -> Result<Option<RelationUpdate>> {
while let Some((object_tag, lsn, value)) = self.iter.next().transpose()? {
let (rel_tag, update) = match object_tag {
ObjectTag::RelationMetadata(rel_tag) => {
let entry = ObjectValue::des_relsize(&value)?;
match self.handle_relation_size(rel_tag, entry) {
Some(relation_update) => (rel_tag, relation_update),
None => continue,
}
}
ObjectTag::RelationBuffer(buf_tag) => {
let entry = ObjectValue::des_page(&value)?;
let update = self.handle_page(buf_tag, entry);
(buf_tag.rel, update)
}
_ => continue,
};
return Ok(Some(RelationUpdate {
rel: rel_tag,
lsn,
update,
}));
}
Ok(None)
}
}
///
/// We store several kinds of objects in the repository.
/// We have per-page, per-relation and per-timeline entries.
/// We have per-page, per-relation(or non-rel file) and per-timeline entries.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum ObjectValue {
Page(PageEntry),
RelationSize(RelationSizeEntry),
TimelineMetadata(MetadataEntry),
SLRUTruncate,
}
///
/// This is what we store for each page in the object store. Use
/// ObjectTag::RelationBuffer as key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum PageEntry {
/// Ready-made image of the block
pub enum ObjectValue {
/// Ready-made images of the block
Page(Bytes),
/// WAL record, to be applied on top of the "previous" entry
/// WAL records, to be applied on top of the "previous" entry
///
/// Some WAL records will initialize the page from scratch. For such records,
/// the 'will_init' flag is set. They don't need the previous page image before
/// applying. The 'will_init' flag is set for records containing a full-page image,
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from Page images
/// stored directly in the repository in that you still need to run the WAL redo
/// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
/// stored directly in the cache entry in that you still need to run the WAL redo
/// routine to generate the page image.
WALRecord(WALRecord),
}
///
/// In addition to page versions, we store relation size as a separate, versioned,
/// object. That way we can answer nblocks requests faster, and we also use it to
/// support relation truncation without having to add a tombstone page version for
/// each block that is truncated away.
///
/// Use ObjectTag::RelationMetadata as the key.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
enum RelationSizeEntry {
Size(u32),
/// RelationSize. We store it separately not only to ansver nblocks requests faster.
/// We also need it to support relation truncation.
RelationSize(u32),
/// Tombstone for a dropped relation.
Unlink,
TimelineMetadata(MetadataEntry),
}
const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
@@ -1106,9 +989,8 @@ const fn relation_size_key(timelineid: ZTimelineId, rel: RelTag) -> ObjectKey {
}
///
/// In addition to the per-page and per-relation entries, we also store
/// a little metadata blob for each timeline. This is not versioned, use
/// ObjectTag::TimelineMetadataTag with constant Lsn(0) as the key.
/// In addition to those per-page and per-relation entries, we also
/// store a little metadata blob for each timeline.
///
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetadataEntry {
@@ -1125,44 +1007,6 @@ const fn timeline_metadata_key(timelineid: ZTimelineId) -> ObjectKey {
}
}
///
/// Helper functions to deserialize ObjectValue, when the caller knows what kind of
/// a value it should be.
///
/// There are no matching helper functions for serializing. Instead, there are
/// `put_page_entry`, `put_relsize_entry`, and `put_timeline_metadata_entry` helper
/// functions in ObjectTimeline that both construct the right kind of key and
/// serialize the value in the same call.
///
impl ObjectValue {
fn des_page(v: &[u8]) -> Result<PageEntry> {
match ObjectValue::des(&v)? {
ObjectValue::Page(p) => Ok(p),
_ => {
bail!("Invalid object kind, expected a page entry");
}
}
}
fn des_relsize(v: &[u8]) -> Result<RelationSizeEntry> {
match ObjectValue::des(&v)? {
ObjectValue::RelationSize(rs) => Ok(rs),
_ => {
bail!("Invalid object kind, expected a relation size entry");
}
}
}
fn des_timeline_metadata(v: &[u8]) -> Result<MetadataEntry> {
match ObjectValue::des(&v)? {
ObjectValue::TimelineMetadata(t) => Ok(t),
_ => {
bail!("Invalid object kind, expected a timeline metadata entry");
}
}
}
}
///
/// Iterator for `object_versions`. Returns all page versions of a given block, in
/// reverse LSN order. This implements the traversal of ancestor timelines. If
@@ -1172,7 +1016,7 @@ impl ObjectValue {
struct ObjectVersionIter<'a> {
obj_store: &'a dyn ObjectStore,
object_tag: ObjectTag,
tag: ObjectTag,
/// Iterator on the current timeline.
current_iter: Box<dyn Iterator<Item = (Lsn, Vec<u8>)> + 'a>,
@@ -1209,7 +1053,7 @@ impl<'a> ObjectVersionIter<'a> {
if let Some(ancestor_timeline) = self.ancestor_timeline {
let searchkey = ObjectKey {
timeline: ancestor_timeline,
tag: self.object_tag,
tag: self.tag,
};
let ancestor_iter = self
.obj_store
@@ -1222,10 +1066,13 @@ impl<'a> ObjectVersionIter<'a> {
.obj_store
.get(&timeline_metadata_key(ancestor_timeline), Lsn(0))
.with_context(|| "timeline not found in repository")?;
let ancestor_metadata = ObjectValue::des_timeline_metadata(&v)?;
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
if let ObjectValue::TimelineMetadata(ancestor_metadata) = ObjectValue::des(&v)? {
self.ancestor_timeline = ancestor_metadata.ancestor_timeline;
self.ancestor_lsn = ancestor_metadata.ancestor_lsn;
self.current_iter = ancestor_iter;
} else {
bail!("Invalid timeline metadata");
}
} else {
return Ok(None);
}

View File

@@ -1,13 +1,19 @@
//! Low-level key-value storage abstraction.
//!
use crate::object_key::*;
use crate::repository::RelTag;
use crate::repository::{ObjectTag, RelTag};
use crate::ZTimelineId;
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::iter::Iterator;
use zenith_utils::lsn::Lsn;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectKey {
pub timeline: ZTimelineId,
pub tag: ObjectTag,
}
///
/// Low-level storage abstraction.
///
@@ -34,9 +40,6 @@ pub trait ObjectStore: Send + Sync {
/// correspond to any real relation.
fn get(&self, key: &ObjectKey, lsn: Lsn) -> Result<Vec<u8>>;
/// Read key greater or equal than specified
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>>;
/// Iterate through all page versions of one object.
///
/// Returns all page versions in descending LSN order, along with the LSN
@@ -82,7 +85,4 @@ pub trait ObjectStore: Send + Sync {
/// Unlink object (used by GC). This mehod may actually delete object or just mark it for deletion.
fn unlink(&self, key: &ObjectKey, lsn: Lsn) -> Result<()>;
// Compact storage and remove versions marged for deletion
fn compact(&self);
}

View File

@@ -5,8 +5,7 @@
use crate::object_repository::ObjectRepository;
use crate::repository::Repository;
//use crate::rocksdb_storage::RocksObjectStore;
use crate::inmem_storage::InmemObjectStore;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use lazy_static::lazy_static;
@@ -19,8 +18,7 @@ lazy_static! {
pub fn init(conf: &'static PageServerConf) {
let mut m = REPOSITORY.lock().unwrap();
//let obj_store = RocksObjectStore::open(conf).unwrap();
let obj_store = InmemObjectStore::open(conf).unwrap();
let obj_store = RocksObjectStore::open(conf).unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf);

View File

@@ -21,16 +21,13 @@ use std::thread;
use std::{io, net::TcpStream};
use zenith_utils::postgres_backend;
use zenith_utils::postgres_backend::PostgresBackend;
use zenith_utils::pq_proto::{
BeMessage, FeMessage, RowDescriptor, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC,
};
use zenith_utils::pq_proto::{BeMessage, FeMessage, HELLO_WORLD_ROW, SINGLE_COL_ROWDESC};
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
use crate::basebackup;
use crate::branches;
use crate::object_key::ObjectTag;
use crate::page_cache;
use crate::repository::{BufferTag, RelTag, RelationUpdate, Update};
use crate::repository::{BufferTag, Modification, ObjectTag, RelTag};
use crate::restore_local_repo;
use crate::walreceiver;
use crate::PageServerConf;
@@ -413,38 +410,14 @@ impl postgres_backend::Handler for PageServerHandler {
while let Some(msg) = pgb.read_message()? {
match msg {
FeMessage::CopyData(bytes) => {
let relation_update = RelationUpdate::des(&bytes)?;
let modification = Modification::des(&bytes)?;
last_lsn = relation_update.lsn;
match relation_update.update {
Update::Page { blknum, img } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});
timeline.put_page_image(tag, relation_update.lsn, img, true)?;
}
Update::WALRecord { blknum, rec } => {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: relation_update.rel,
blknum,
});
timeline.put_wal_record(tag, rec)?;
}
Update::Truncate { n_blocks } => {
timeline.put_truncation(
relation_update.rel,
relation_update.lsn,
n_blocks,
)?;
}
Update::Unlink => {
todo!()
}
}
last_lsn = modification.lsn;
timeline.put_raw_data(
modification.tag,
last_lsn,
&modification.data[..],
)?;
}
FeMessage::CopyDone => {
timeline.advance_last_valid_lsn(last_lsn);
@@ -509,97 +482,11 @@ impl postgres_backend::Handler for PageServerHandler {
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?;
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(system_id.as_bytes())]))?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with(b"do_gc ") {
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.
let query_str = std::str::from_utf8(&query_string)?;
let mut it = query_str.split(' ');
it.next().unwrap();
let timeline_id: ZTimelineId = it
.next()
.ok_or_else(|| anyhow!("missing timeline id"))?
.parse()?;
let timeline = page_cache::get_repository().get_timeline(timeline_id)?;
let horizon: u64 = it
.next()
.unwrap_or(&self.conf.gc_horizon.to_string())
.parse()?;
let result = timeline.gc_iteration(horizon)?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor {
name: b"n_relations",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"truncated",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"prep_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"slru_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"chkp_deleted",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"dropped",
typoid: 20,
typlen: 8,
..Default::default()
},
RowDescriptor {
name: b"elapsed",
typoid: 20,
typlen: 8,
..Default::default()
},
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(&result.n_relations.to_string().as_bytes()),
Some(&result.truncated.to_string().as_bytes()),
Some(&result.deleted.to_string().as_bytes()),
Some(&result.prep_deleted.to_string().as_bytes()),
Some(&result.slru_deleted.to_string().as_bytes()),
Some(&result.chkp_deleted.to_string().as_bytes()),
Some(&result.dropped.to_string().as_bytes()),
Some(&result.elapsed.as_millis().to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
bail!("unknown command");
}
pgb.flush()?;
Ok(())
}
}

View File

@@ -1,4 +1,3 @@
use crate::object_key::*;
use crate::waldecoder::TransactionId;
use crate::ZTimelineId;
use anyhow::Result;
@@ -11,7 +10,6 @@ use std::collections::HashSet;
use std::fmt;
use std::iter::Iterator;
use std::sync::Arc;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
///
@@ -34,22 +32,6 @@ pub trait Repository: Send + Sync {
//fn get_stats(&self) -> RepositoryStats;
}
///
/// Result of performing GC
///
#[derive(Default)]
pub struct GcResult {
pub n_relations: u64,
pub inspected: u64,
pub truncated: u64,
pub deleted: u64,
pub prep_deleted: u64, // 2PC prepare
pub slru_deleted: u64, // SLRU (clog, multixact)
pub chkp_deleted: u64, // Checkpoints
pub dropped: u64,
pub elapsed: Duration,
}
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
@@ -67,7 +49,7 @@ pub trait Timeline: Send + Sync {
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
/// Get a list of all distinct relations in given tablespace and database.
/// Get a list of all relations in given tablespace and database.
fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelTag>>;
/// Get a list of non-relational objects
@@ -79,29 +61,24 @@ pub trait Timeline: Send + Sync {
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: ObjectTag, rec: WALRecord) -> Result<()>;
/// Put raw data
fn put_raw_data(&self, tag: ObjectTag, lsn: Lsn, data: &[u8]) -> Result<()>;
/// Like put_wal_record, but with ready-made image of the page.
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes, update_meta: bool) -> Result<()>;
fn put_page_image(&self, tag: ObjectTag, lsn: Lsn, img: Bytes) -> Result<()>;
/// Truncate relation
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Unlink relation. This method is used for marking dropped relations.
fn put_unlink(&self, tag: RelTag, lsn: Lsn) -> Result<()>;
/// Truncate SRLU segment
fn put_slru_truncate(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
// Get object tag greater or equal than specified
fn get_next_tag(&self, tag: ObjectTag) -> Result<Option<ObjectTag>>;
/// Unlink object. This method is used for marking dropped relations
/// and removed segments of SLRUs.
fn put_unlink(&self, tag: ObjectTag, lsn: Lsn) -> Result<()>;
/// Remember the all WAL before the given LSN has been processed.
///
@@ -135,13 +112,6 @@ pub trait Timeline: Send + Sync {
// TODO ordering guarantee?
fn history<'a>(&'a self) -> Result<Box<dyn History + 'a>>;
/// Perform one garbage collection iteration.
/// Garbage collection is periodically performed by GC thread,
/// but it can be explicitly requested through page server API.
///
/// `horizon` specifies delta from last LSN to preserve all object versions (PITR interval).
fn gc_iteration(&self, horizon: u64) -> Result<GcResult>;
// Check transaction status
fn get_tx_status(&self, xid: TransactionId, lsn: Lsn) -> anyhow::Result<u8> {
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
@@ -152,24 +122,26 @@ pub trait Timeline: Send + Sync {
}
}
pub trait History: Iterator<Item = Result<RelationUpdate>> {
pub trait History: Iterator<Item = Result<Modification>> {
/// The last_valid_lsn at the time of history() call.
fn lsn(&self) -> Lsn;
}
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct RelationUpdate {
pub rel: RelTag,
pub struct Modification {
pub tag: ObjectTag,
pub lsn: Lsn,
pub update: Update,
pub data: Vec<u8>,
}
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum Update {
Page { blknum: u32, img: Bytes },
WALRecord { blknum: u32, rec: WALRecord },
Truncate { n_blocks: u32 },
Unlink,
impl Modification {
pub fn new(entry: (ObjectTag, Lsn, Vec<u8>)) -> Modification {
Modification {
tag: entry.0,
lsn: entry.1,
data: entry.2,
}
}
}
#[derive(Clone)]
@@ -195,6 +167,10 @@ pub struct RepositoryStats {
/// are used for the same purpose.
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
///
/// We use additional fork numbers to logically separate relational and
/// non-relational data inside pageserver key-value storage.
/// See, e.g., `ROCKSDB_SPECIAL_FORKNUM`.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct RelTag {
pub forknum: u8,
@@ -237,6 +213,8 @@ impl fmt::Display for RelTag {
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
/// NOTE: In this context we use buffer, block and page interchangeably when speaking about relation files.
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
pub struct BufferTag {
pub rel: RelTag,
@@ -250,6 +228,71 @@ impl BufferTag {
};
}
///
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and pg_multixact)
/// in Postgres are handled by SLRU (Simple LRU) buffer, hence the name.
///
/// These files are global for a postgres instance.
///
/// These files are divided into segments, which are divided into pages
/// of the same BLCKSZ as used for relation files.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct SlruBufferTag {
    // Absolute page number within the SLRU (e.g. xid / CLOG_XACTS_PER_PAGE),
    // not relative to a segment.
    pub blknum: u32,
}
///
/// Special type of Postgres files: pg_filenode.map is needed to map
/// catalog table OIDs to filenode numbers, which define filename.
///
/// Each database has a map file for its local mapped catalogs,
/// and there is a separate map file for shared catalogs.
///
/// These files have untypical size of 512 bytes.
///
/// See PostgreSQL relmapper.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct DatabaseTag {
    // Tablespace OID the map file belongs to.
    pub spcnode: u32,
    // Database OID; 0 is used for the shared-catalog map file.
    pub dbnode: u32,
}
///
/// Non-relation files that keep state for prepared transactions.
/// Unlike other files these are not divided into pages.
///
/// See PostgreSQL twophase.c for details.
///
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct PrepareTag {
    // XID of the prepared (two-phase) transaction this state file belongs to.
    pub xid: TransactionId,
}
/// ObjectTag is a part of ObjectKey that is specific
/// to the type of the stored object.
///
/// The variant order is significant: keys are ordered by tag (note the
/// `Ord` derive), and relation entries are deliberately placed last so
/// that all non-relation objects can be iterated without scanning
/// relation data.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub enum ObjectTag {
    // Dummy tag preceding all other keys.
    FirstTag,
    // Tag for per-timeline metadata.
    TimelineMetadataTag,
    // Special entry that represents PostgreSQL checkpoint.
    // We use it to track fields needed to restore controlfile checkpoint.
    Checkpoint,
    // Various types of non-relation files.
    // We need them to bootstrap compute node.
    ControlFile,
    Clog(SlruBufferTag),
    MultiXactMembers(SlruBufferTag),
    MultiXactOffsets(SlruBufferTag),
    FileNodeMap(DatabaseTag),
    TwoPhase(PrepareTag),
    // Put relations at the end of enum to allow efficient iterations through non-rel objects.
    RelationMetadata(RelTag),
    RelationBuffer(BufferTag),
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
@@ -291,6 +334,7 @@ impl WALRecord {
mod tests {
use super::*;
use crate::object_repository::ObjectRepository;
use crate::object_repository::ObjectValue;
use crate::rocksdb_storage::RocksObjectStore;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
@@ -299,6 +343,7 @@ mod tests {
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
use zenith_utils::bin_ser::BeSer;
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelTag = RelTag {
@@ -338,7 +383,7 @@ mod tests {
interactive: false,
gc_horizon: 64 * 1024 * 1024,
gc_period: Duration::from_secs(10),
listen_addr: "127.0.0.1:5430".to_string(),
listen_addr: "127.0.0.1:5430".parse().unwrap(),
workdir: repo_dir,
pg_distrib_dir: "".into(),
};
@@ -367,11 +412,11 @@ mod tests {
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"), true)?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"), true)?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"), true)?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"))?;
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"))?;
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"))?;
tline.advance_last_valid_lsn(Lsn(5));
@@ -458,7 +503,7 @@ mod tests {
for i in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
lsn += 1;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img, true)?;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img)?;
}
tline.advance_last_valid_lsn(Lsn(lsn));
@@ -495,48 +540,82 @@ mod tests {
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(0));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(None, snapshot.next().transpose()?);
// add a page and advance the last valid LSN
let rel = TESTREL_A;
let tag = TEST_BUF(1);
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"), true)?;
tline.put_page_image(tag, Lsn(1), TEST_IMG("blk 1 @ lsn 1"))?;
tline.advance_last_valid_lsn(Lsn(1));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let expected_page = RelationUpdate {
rel: rel,
lsn: Lsn(1),
update: Update::Page {
blknum: 1,
img: TEST_IMG("blk 1 @ lsn 1"),
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
let expected_page = Modification {
tag,
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::Page(TEST_IMG("blk 1 @ lsn 1")))?,
};
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);
// truncate to zero, but don't advance the last valid LSN
tline.put_truncation(rel, Lsn(2), 0)?;
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(1));
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationBuffer(_) => false,
_ => true,
},
_ => true,
});
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);
// advance the last valid LSN and the truncation should be observable
tline.advance_last_valid_lsn(Lsn(2));
let mut snapshot = tline.history()?;
let snapshot = tline.history()?;
assert_eq!(snapshot.lsn(), Lsn(2));
// TODO ordering not guaranteed by API. But currently it returns the
// truncation entry before the block data.
let expected_truncate = RelationUpdate {
rel: rel,
lsn: Lsn(2),
update: Update::Truncate { n_blocks: 0 },
let mut snapshot = snapshot.skip_while(|r| match r {
Ok(m) => match m.tag {
ObjectTag::RelationMetadata(_) => false,
_ => true,
},
_ => true,
});
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(1),
data: ObjectValue::ser(&ObjectValue::RelationSize(2))?,
};
assert_eq!(Some(expected_truncate), snapshot.next().transpose()?);
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
let expected_truncate = Modification {
tag: ObjectTag::RelationMetadata(rel),
lsn: Lsn(2),
data: ObjectValue::ser(&ObjectValue::RelationSize(0))?,
};
assert_eq!(
Some(&expected_truncate),
snapshot.next().transpose()?.as_ref()
); // TODO ordering not guaranteed by API
assert_eq!(Some(&expected_page), snapshot.next().transpose()?.as_ref());
assert_eq!(None, snapshot.next().transpose()?);

View File

@@ -12,21 +12,21 @@ use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use anyhow::Result;
use bytes::{Buf, Bytes};
use bytes::Bytes;
use crate::object_key::*;
use crate::repository::*;
use crate::waldecoder::*;
use crate::repository::{
BufferTag, DatabaseTag, ObjectTag, PrepareTag, RelTag, SlruBufferTag, Timeline, WALRecord,
};
use crate::waldecoder::{decode_wal_record, DecodedWALRecord, Oid, WalStreamDecoder};
use crate::waldecoder::{XlCreateDatabase, XlSmgrTruncate};
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::{pg_constants, CheckPoint, ControlFileData};
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
///
/// Find latest snapshot in a timeline's 'snapshots' directory
///
@@ -203,7 +203,7 @@ fn import_relfile(
},
blknum,
});
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf), true)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -236,7 +236,7 @@ fn import_nonrel_file(
// read the whole file
file.read_to_end(&mut buffer)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]), false)?;
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]))?;
Ok(())
}
@@ -256,7 +256,7 @@ fn import_slru_file(
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf), false)?;
timeline.put_page_image(gen_tag(blknum), lsn, Bytes::copy_from_slice(&buf))?;
}
// TODO: UnexpectedEof is expected
@@ -289,8 +289,6 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
let checkpoint_bytes = timeline.get_page_at_lsn_nowait(ObjectTag::Checkpoint, startpoint)?;
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
// get_page_at_lsn_nowait returns pages with zeros when object is not found in the storage.
// nextXid can not be zero, so this check is used to detect situation when checkpoint record needs to be initialized.
if checkpoint.nextXid.value == 0 {
let pg_control_bytes =
timeline.get_page_at_lsn_nowait(ObjectTag::ControlFile, startpoint)?;
@@ -339,8 +337,8 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
break;
}
if let Some((lsn, recdata)) = rec.unwrap() {
let decoded = decode_wal_record(recdata.clone());
save_decoded_record(&mut checkpoint, timeline, &decoded, recdata, lsn)?;
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
save_decoded_record(timeline, &decoded, recdata, lsn)?;
last_lsn = lsn;
} else {
break;
@@ -360,7 +358,7 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
}
info!("reached end of WAL at {}", last_lsn);
let checkpoint_bytes = checkpoint.encode();
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes, false)?;
timeline.put_page_image(ObjectTag::Checkpoint, last_lsn, checkpoint_bytes)?;
Ok(())
}
@@ -369,163 +367,40 @@ pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint:
/// relations/pages that the record affects.
///
pub fn save_decoded_record(
checkpoint: &mut CheckPoint,
timeline: &dyn Timeline,
decoded: &DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
checkpoint.update_next_xid(decoded.xl_xid);
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
blknum: blk.blkno,
});
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec)?;
if blk.will_drop {
timeline.put_unlink(blk.tag, lsn)?;
} else {
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(blk.tag, rec)?;
}
}
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
// Handle a few special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
let truncate = XlSmgrTruncate::decode(&decoded);
save_xlog_smgr_truncate(timeline, lsn, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
} else {
// TODO
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
let blknum = buf.get_u32_le();
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
let tag = ObjectTag::Clog(SlruBufferTag { blknum });
if info == pg_constants::CLOG_ZEROPAGE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(tag, rec)?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
checkpoint.oldestXid = buf.get_u32_le();
checkpoint.oldestXidDB = buf.get_u32_le();
trace!(
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
blknum,
checkpoint.oldestXid,
checkpoint.oldestXidDB
);
if let Some(ObjectTag::Clog(first_slru_tag)) =
timeline.get_next_tag(ObjectTag::Clog(SlruBufferTag { blknum: 0 }))?
{
for trunc_blknum in first_slru_tag.blknum..=blknum {
let tag = ObjectTag::Clog(SlruBufferTag {
blknum: trunc_blknum,
});
timeline.put_slru_truncate(tag, lsn)?;
}
}
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
save_xact_record(timeline, lsn, &parsed_xact, decoded)?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
timeline.put_wal_record(
ObjectTag::TwoPhase(PrepareTag {
xid: decoded.xl_xid,
}),
rec,
)?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
{
let blknum = buf.get_u32_le();
let rec = WALRecord {
lsn,
will_init: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
let tag = if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
ObjectTag::MultiXactOffsets(SlruBufferTag { blknum })
} else {
ObjectTag::MultiXactMembers(SlruBufferTag { blknum })
};
timeline.put_wal_record(tag, rec)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
save_relmap_record(timeline, lsn, &xlrec, decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
checkpoint.nextOid = next_oid;
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
checkpoint.oldestXid
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
}
}
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&decoded);
save_xlog_dbase_create(timeline, lsn, &createdb)?;
}
// Now that this record has been handled, let the repository know that
// it is up-to-date to this LSN
timeline.advance_last_record_lsn(lsn);
@@ -579,7 +454,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
debug!("copying block {:?} to {:?}", src_key, dst_key);
timeline.put_page_image(dst_key, lsn, content, true)?;
timeline.put_page_image(dst_key, lsn, content)?;
num_blocks_copied += 1;
}
@@ -600,7 +475,7 @@ fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatab
spcnode: tablespace_id,
dbnode: db_id,
});
timeline.put_page_image(new_tag, lsn, img, false)?;
timeline.put_page_image(new_tag, lsn, img)?;
break;
}
}
@@ -675,158 +550,3 @@ fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTrunca
}
Ok(())
}
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
///
/// We are currently only interested in the dropped relations.
///
/// The record is attached to every CLOG page that the transaction (and its
/// subtransactions) touches, and each relation dropped by the transaction
/// is unlinked.
fn save_xact_record(
    timeline: &dyn Timeline,
    lsn: Lsn,
    parsed: &XlXactParsedRecord,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    // Record update of CLOG page holding the status of the main xid.
    let mut blknum = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE;
    let tag = ObjectTag::Clog(SlruBufferTag { blknum });
    let rec = WALRecord {
        lsn,
        // CLOG pages are updated in place, so the previous page version is needed.
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
    };
    timeline.put_wal_record(tag, rec.clone())?;

    // Subtransaction statuses may live on other CLOG pages; emit the record
    // once per page change.
    // NOTE(review): this only compares against the previously seen page, which
    // assumes subxacts are grouped by page -- confirm the ordering guarantee.
    for subxact in &parsed.subxacts {
        let subxact_blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
        if subxact_blknum != blknum {
            blknum = subxact_blknum;
            let tag = ObjectTag::Clog(SlruBufferTag { blknum });
            timeline.put_wal_record(tag, rec.clone())?;
        }
    }

    // Unlink every fork of each relation dropped by this transaction.
    for xnode in &parsed.xnodes {
        for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
            let rel_tag = RelTag {
                forknum,
                spcnode: xnode.spcnode,
                dbnode: xnode.dbnode,
                relnode: xnode.relnode,
            };
            timeline.put_unlink(rel_tag, lsn)?;
        }
    }
    Ok(())
}
/// Subroutine of save_decoded_record(): handle XLOG_MULTIXACT_CREATE_ID.
///
/// Attaches the WAL record to every SLRU page it touches (one offsets page
/// and one or more members pages) and advances the multixact counters kept
/// in the in-memory checkpoint.
fn save_multixact_create_record(
    checkpoint: &mut CheckPoint,
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactCreate,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    let rec = WALRecord {
        lsn,
        will_init: false,
        rec: decoded.record.clone(),
        main_data_offset: decoded.main_data_offset as u32,
    };

    // Offsets page holding this multixact id.
    let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
    let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
    timeline.put_wal_record(tag, rec.clone())?;

    // Range of members pages this record spans.
    let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
    let last_mbr_blkno =
        (xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

    // The members SLRU can, in contrast to the offsets one, be filled to almost
    // the full range at once. So we need to handle wraparound.
    let mut blknum = first_mbr_blkno;
    loop {
        // Update members page
        let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
        timeline.put_wal_record(tag, rec.clone())?;
        if blknum == last_mbr_blkno {
            // last block inclusive
            break;
        }
        // handle wraparound
        if blknum == MAX_MBR_BLKNO {
            blknum = 0;
        } else {
            blknum += 1;
        }
    }

    // Advance nextMulti / nextMultiOffset past what this record creates.
    if xlrec.mid >= checkpoint.nextMulti {
        checkpoint.nextMulti = xlrec.mid + 1;
    }
    if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
        checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
    }

    // Find the largest member xid using a wraparound-aware comparison,
    // and make sure nextXid is past it.
    let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
        if mbr.xid.wrapping_sub(acc) as i32 > 0 {
            mbr.xid
        } else {
            acc
        }
    });
    checkpoint.update_next_xid(max_mbr_xid);

    Ok(())
}
/// Subroutine of save_decoded_record(): handle XLOG_MULTIXACT_TRUNCATE_ID.
///
/// Records the new oldest multixact in the in-memory checkpoint and drops
/// the truncated-away offsets and members SLRU pages.
fn save_multixact_truncate_record(
    checkpoint: &mut CheckPoint,
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlMultiXactTruncate,
) -> Result<()> {
    checkpoint.oldestMulti = xlrec.end_trunc_off;
    checkpoint.oldestMultiDB = xlrec.oldest_multi_db;

    let first_off_blkno = xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
    let last_off_blkno = xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;

    // Delete all the segments except the last one. The last segment can still
    // contain, possibly partially, valid data.
    for blknum in first_off_blkno..last_off_blkno {
        let tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
        timeline.put_slru_truncate(tag, lsn)?;
    }

    let first_mbr_blkno = xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
    let last_mbr_blkno = xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;

    // The members SLRU can, in contrast to the offsets one, be filled to almost
    // the full range at once. So we need to handle wraparound.
    let mut blknum = first_mbr_blkno;
    // Delete all the segments but the last one. The last segment can still
    // contain, possibly partially, valid data.
    while blknum != last_mbr_blkno {
        let tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
        timeline.put_slru_truncate(tag, lsn)?;
        // handle wraparound
        if blknum == MAX_MBR_BLKNO {
            blknum = 0;
        } else {
            blknum += 1;
        }
    }

    Ok(())
}
/// Subroutine of save_decoded_record(): handle an RM_RELMAP_ID record.
///
/// Stores the WAL record under the FileNodeMap tag of the affected
/// (tablespace, database) pair.
fn save_relmap_record(
    timeline: &dyn Timeline,
    lsn: Lsn,
    xlrec: &XlRelmapUpdate,
    decoded: &DecodedWALRecord,
) -> Result<()> {
    let tag = ObjectTag::FileNodeMap(DatabaseTag {
        spcnode: xlrec.tsid,
        dbnode: xlrec.dbid,
    });

    // A relmap update replaces the whole pg_filenode.map content, so the
    // record can be applied without any previous page version (will_init).
    timeline.put_wal_record(
        tag,
        WALRecord {
            lsn,
            will_init: true,
            rec: decoded.record.clone(),
            main_data_offset: decoded.main_data_offset as u32,
        },
    )?;

    Ok(())
}

View File

@@ -0,0 +1,274 @@
//
// Restore chunks from S3
//
// This runs once at Page Server startup. It loads all the "base images" from
// S3 into the in-memory page cache. It also initializes the "last valid LSN"
// in the page cache to the LSN of the base image, so that when the WAL receiver
// is started, it starts streaming from that LSN.
//
use bytes::{Buf, BytesMut};
use log::*;
use regex::Regex;
use std::env;
use std::fmt;
use s3::bucket::Bucket;
use s3::creds::Credentials;
use s3::region::Region;
use s3::S3Error;
use tokio::runtime;
use futures::future;
use crate::{page_cache, PageServerConf};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
// Connection parameters for the S3-compatible object store holding the
// base images.
struct Storage {
    region: Region,
    credentials: Credentials,
    // Bucket name; currently hard-coded to "zenith-testbucket" by the caller.
    bucket: String,
}
/// Entry point of the restore phase: spin up a temporary tokio runtime,
/// run `restore_chunk` to completion, and log (but do not propagate) any
/// S3 error it returns.
pub fn restore_main(conf: &PageServerConf) {
    // Create a new thread pool
    let runtime = runtime::Runtime::new().unwrap();

    runtime.block_on(async {
        if let Err(err) = restore_chunk(conf).await {
            error!("S3 error: {}", err);
        }
    });
}
//
// Restores one chunk from S3.
//
// 1. Fetch the last base image >= given LSN
// 2. Fetch all WAL
//
// Load it all into the page cache.
//
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
    // Read the S3 endpoint and credentials from the environment.
    // NOTE(review): unwrap() panics if any of these env vars is unset.
    let backend = Storage {
        region: Region::Custom {
            region: env::var("S3_REGION").unwrap(),
            endpoint: env::var("S3_ENDPOINT").unwrap(),
        },
        credentials: Credentials::new(
            Some(&env::var("S3_ACCESSKEY").unwrap()),
            Some(&env::var("S3_SECRET").unwrap()),
            None,
            None,
            None,
        )
        .unwrap(),
        bucket: "zenith-testbucket".to_string(),
    };

    info!("Restoring from S3...");

    // Create Bucket in REGION for BUCKET
    let bucket = Bucket::new_with_path_style(&backend.bucket, backend.region, backend.credentials)?;

    // List out contents of directory
    let results: Vec<s3::serde_types::ListBucketResult> = bucket
        .list("relationdata/".to_string(), Some("".to_string()))
        .await?;

    // TODO: get that from backup
    let sys_id: u64 = 42;

    // Track the oldest base-image LSN seen; it becomes the initial
    // "last valid LSN" of the page cache (0 means "none found yet").
    let mut oldest_lsn = 0;
    let mut slurp_futures: Vec<_> = Vec::new();

    for result in results {
        for object in result.contents {
            // Download every relation file, slurping them into memory
            let key = object.key;
            let relpath = key.strip_prefix("relationdata/").unwrap();

            let parsed = parse_rel_file_path(&relpath);
            match parsed {
                Ok(p) => {
                    if oldest_lsn == 0 || p.lsn < oldest_lsn {
                        oldest_lsn = p.lsn;
                    }
                    // Build the download future; all of them are awaited
                    // together below.
                    let b = bucket.clone();
                    let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
                    slurp_futures.push(f);
                }
                Err(e) => {
                    // Not a recognizable relation data file; skip it.
                    warn!("unrecognized file: {} ({})", relpath, e);
                }
            };
        }
    }

    if oldest_lsn == 0 {
        panic!("no base backup found");
    }

    let pcache = page_cache::get_pagecache(conf, sys_id);
    pcache.init_valid_lsn(oldest_lsn);

    info!("{} files to restore...", slurp_futures.len());

    // Run all the downloads concurrently.
    future::join_all(slurp_futures).await;

    info!("restored!");
    Ok(())
}
// Components extracted from a base-image file path in S3.
#[derive(Debug)]
struct ParsedBaseImageFileName {
    pub spcnode: u32,
    pub dbnode: u32,
    pub relnode: u32,
    pub forknum: u8,
    // Relation segment number (0 if the filename has no ".<segno>" suffix).
    pub segno: u32,
    // LSN of the base image, parsed from the filename suffix.
    pub lsn: u64,
}
// Parse a relation data file name followed by a 16-hex-digit LSN suffix.
// Accepted formats (each with a trailing "_<lsn-hi 8 hex><lsn-lo 8 hex>"):
//   <oid>
//   <oid>_<fork name>
//   <oid>.<segment number>
//   <oid>_<fork name>.<segment number>
//
// Returns (relnode, forknum, segno, lsn) or FilePathError if the name
// does not match.
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
    // NOTE: the regex is compiled on every call; hoist it if this ever
    // shows up in a profile.
    let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();

    let caps = re
        .captures(fname)
        .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;

    let relnode_str = caps.name("relnode").unwrap().as_str();
    let relnode: u32 = relnode_str.parse()?;

    let forkname = caps.name("forkname").map(|f| f.as_str());
    let forknum = forkname_to_forknum(forkname)?;

    // Segment number is decimal; a missing suffix means segment 0.
    let segno = match caps.name("segno") {
        Some(m) => m.as_str().parse::<u32>()?,
        None => 0,
    };

    // BUGFIX: the LSN halves are hexadecimal (the regex matches
    // [[:xdigit:]]{8}), so they must be parsed with radix 16. The previous
    // decimal `parse()` failed on any 'a'-'f' digit and misinterpreted
    // all-decimal hex strings.
    let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
    let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
    let lsn = lsn_hi << 32 | lsn_lo;

    Ok((relnode, forknum, segno, lsn))
}
// Parse a relation data file path into its components.
//
// Relation data files live in one of three places:
//
//   global/                            -- shared relations
//   base/<db oid>/                     -- regular relations, default tablespace
//   pg_tblspc/<tblspc oid>/<version>/  -- non-default tablespace (not supported)
//
// The file name itself looks like <oid>.<segment number> (see
// parse_filename for the full format).
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
    if let Some(fname) = path.strip_prefix("global/") {
        // Shared relation in the global tablespace.
        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
            spcnode: pg_constants::GLOBALTABLESPACE_OID,
            dbnode: 0,
            relnode,
            forknum,
            segno,
            lsn,
        })
    } else if let Some(dbpath) = path.strip_prefix("base/") {
        // Expect exactly two components: "<dbnode>/<fname>".
        let mut parts = dbpath.split('/');
        let dbnode_str = parts
            .next()
            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
        let dbnode: u32 = dbnode_str.parse()?;
        let fname = parts
            .next()
            .ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
        if parts.next().is_some() {
            return Err(FilePathError::new("invalid relation data file name"));
        }

        let (relnode, forknum, segno, lsn) = parse_filename(fname)?;

        Ok(ParsedBaseImageFileName {
            spcnode: pg_constants::DEFAULTTABLESPACE_OID,
            dbnode,
            relnode,
            forknum,
            segno,
            lsn,
        })
    } else if path.starts_with("pg_tblspc/") {
        // TODO
        Err(FilePathError::new("tablespaces not supported"))
    } else {
        Err(FilePathError::new("invalid relation data file name"))
    }
}
//
// Load a base file from S3, and insert it into the page cache
//
async fn slurp_base_file(
    conf: &PageServerConf,
    sys_id: u64,
    bucket: Bucket,
    s3path: String,
    parsed: ParsedBaseImageFileName,
) {
    // FIXME: rust-s3 opens a new connection for each request. Should reuse
    // the reqwest::Client object. But that requires changes to rust-s3 itself.
    let (data, code) = bucket.get_object(s3path.clone()).await.unwrap();

    trace!("got response: {} on {}", code, &s3path);
    // Anything other than HTTP 200 is treated as a fatal error.
    assert_eq!(200, code);

    let mut bytes = BytesMut::from(data.as_slice()).freeze();

    // First block number of this segment: each 1 GB segment holds
    // (1 GB / BLCKSZ) pages.
    let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);

    let pcache = page_cache::get_pagecache(conf, sys_id);

    // Split the file into fixed-size pages and store each one at the
    // base image's LSN.
    // NOTE(review): the page size is hard-coded as 8192 here but derived
    // from pg_constants::BLCKSZ above -- confirm they always agree.
    while bytes.remaining() >= 8192 {
        let tag = page_cache::BufferTag {
            rel: page_cache::RelTag {
                spcnode: parsed.spcnode,
                dbnode: parsed.dbnode,
                relnode: parsed.relnode,
                forknum: parsed.forknum,
            },
            blknum,
        };

        pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));

        blknum += 1;
    }
}

View File

@@ -1,9 +1,8 @@
//!
//! An implementation of the ObjectStore interface, backed by RocksDB
//!
use crate::object_key::*;
use crate::object_store::ObjectStore;
use crate::repository::RelTag;
use crate::object_store::{ObjectKey, ObjectStore};
use crate::repository::{BufferTag, ObjectTag, RelTag};
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::{bail, Result};
@@ -25,7 +24,7 @@ impl StorageKey {
Self {
obj_key: ObjectKey {
timeline,
tag: ObjectTag::TimelineMetadataTag,
tag: ObjectTag::FirstTag,
},
lsn: Lsn(0),
}
@@ -94,21 +93,6 @@ impl ObjectStore for RocksObjectStore {
}
}
fn get_next_key(&self, key: &ObjectKey) -> Result<Option<ObjectKey>> {
let mut iter = self.db.raw_iterator();
let search_key = StorageKey {
obj_key: key.clone(),
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
Ok(None)
} else {
let key = StorageKey::des(iter.key().unwrap())?;
Ok(Some(key.obj_key.clone()))
}
}
fn put(&self, key: &ObjectKey, lsn: Lsn, value: &[u8]) -> Result<()> {
self.db.put(
StorageKey::ser(&StorageKey {
@@ -167,43 +151,41 @@ impl ObjectStore for RocksObjectStore {
let mut rels: HashSet<RelTag> = HashSet::new();
let mut search_rel_tag = RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
let mut search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
spcnode,
dbnode,
relnode: 0,
forknum: 0u8,
},
blknum: 0,
}),
},
lsn: Lsn(0),
};
let mut iter = self.db.raw_iterator();
loop {
let search_key = StorageKey {
obj_key: ObjectKey {
timeline: timelineid,
tag: ObjectTag::RelationMetadata(search_rel_tag),
},
lsn: Lsn(0),
};
iter.seek(search_key.ser()?);
if !iter.valid() {
break;
}
let key = StorageKey::des(iter.key().unwrap())?;
if let ObjectTag::RelationMetadata(rel_tag) = key.obj_key.tag {
if spcnode != 0 && rel_tag.spcnode != spcnode
|| dbnode != 0 && rel_tag.dbnode != dbnode
if let ObjectTag::RelationBuffer(buf_tag) = key.obj_key.tag {
if (spcnode != 0 && buf_tag.rel.spcnode != spcnode)
|| (dbnode != 0 && buf_tag.rel.dbnode != dbnode)
{
break;
}
if key.lsn <= lsn {
// visible in this snapshot
rels.insert(rel_tag);
if key.lsn < lsn {
rels.insert(buf_tag.rel);
}
search_rel_tag = rel_tag;
// skip to next relation
// FIXME: What if relnode is u32::MAX ?
search_rel_tag.relnode += 1;
let mut next_tag = buf_tag.clone();
next_tag.rel.relnode += 1; // skip to next relation
search_key.obj_key.tag = ObjectTag::RelationBuffer(next_tag);
} else {
// no more relation metadata entries
break;
}
}
@@ -233,10 +215,6 @@ impl ObjectStore for RocksObjectStore {
lsn,
}))
}
fn compact(&self) {
self.db.compact_range::<&[u8], &[u8]>(None, None);
}
}
impl RocksObjectStore {

View File

@@ -2,14 +2,17 @@
//! WAL decoder. For each WAL record, it decodes the record to figure out which data blocks
//! the record affects, to add the records to the page cache.
//!
use crate::repository::*;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::CheckPoint;
use postgres_ffi::XLogLongPageHeaderData;
use postgres_ffi::XLogPageHeaderData;
use postgres_ffi::XLogRecord;
use std::cmp::min;
use std::str;
use thiserror::Error;
use zenith_utils::lsn::Lsn;
@@ -21,6 +24,9 @@ pub type MultiXactId = TransactionId;
pub type MultiXactOffset = u32;
pub type MultiXactStatus = u32;
const MAX_MBR_BLKNO: u32 =
pg_constants::MAX_MULTIXACT_ID / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
#[allow(dead_code)]
pub struct WalStreamDecoder {
lsn: Lsn,
@@ -196,12 +202,7 @@ pub struct DecodedBkpBlock {
//in_use: bool,
/* Identify the block this refers to */
pub rnode_spcnode: u32,
pub rnode_dbnode: u32,
pub rnode_relnode: u32,
// Note that we have a few special forknum values for non-rel files.
pub forknum: u8,
pub blkno: u32,
pub tag: ObjectTag,
/* copy of the fork_flags field from the XLogRecordBlockHeader */
flags: u8,
@@ -210,6 +211,7 @@ pub struct DecodedBkpBlock {
has_image: bool, /* has image, even for consistency checking */
pub apply_image: bool, /* has image that should be restored */
pub will_init: bool, /* record doesn't need previous page version to apply */
pub will_drop: bool, /* record drops relation */
//char *bkp_image;
hole_offset: u16,
hole_length: u16,
@@ -224,16 +226,13 @@ pub struct DecodedBkpBlock {
impl DecodedBkpBlock {
pub fn new() -> DecodedBkpBlock {
DecodedBkpBlock {
rnode_spcnode: 0,
rnode_dbnode: 0,
rnode_relnode: 0,
forknum: 0,
blkno: 0,
tag: ObjectTag::FirstTag,
flags: 0,
has_image: false,
apply_image: false,
will_init: false,
will_drop: false,
hole_offset: 0,
hole_length: 0,
bimg_len: 0,
@@ -246,7 +245,6 @@ impl DecodedBkpBlock {
}
pub struct DecodedWALRecord {
pub xl_xid: TransactionId,
pub xl_info: u8,
pub xl_rmid: u8,
pub record: Bytes, // raw XLogRecord
@@ -290,7 +288,9 @@ pub struct XlSmgrTruncate {
}
impl XlSmgrTruncate {
pub fn decode(buf: &mut Bytes) -> XlSmgrTruncate {
pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
let mut buf = decoded.record.clone();
buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
XlSmgrTruncate {
blkno: buf.get_u32_le(),
rnode: RelFileNode {
@@ -313,7 +313,9 @@ pub struct XlCreateDatabase {
}
impl XlCreateDatabase {
pub fn decode(buf: &mut Bytes) -> XlCreateDatabase {
pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
let mut buf = decoded.record.clone();
buf.advance((XLOG_SIZE_OF_XLOG_RECORD + 2) as usize);
XlCreateDatabase {
db_id: buf.get_u32_le(),
tablespace_id: buf.get_u32_le(),
@@ -399,103 +401,6 @@ impl XlHeapUpdate {
}
}
///
/// Note: Parsing some fields is missing, because they're not needed.
///
/// This is similar to the xl_xact_parsed_commit and
/// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
/// struct for commits and aborts.
///
#[derive(Debug)]
pub struct XlXactParsedRecord {
pub xid: TransactionId,
pub info: u8,
pub xact_time: TimestampTz,
pub xinfo: u32,
pub db_id: Oid, /* MyDatabaseId */
pub ts_id: Oid, /* MyDatabaseTableSpace */
pub subxacts: Vec<TransactionId>,
pub xnodes: Vec<RelFileNode>,
}
impl XlXactParsedRecord {
/// Decode a XLOG_XACT_COMMIT/ABORT/COMMIT_PREPARED/ABORT_PREPARED
/// record. This should agree with the ParseCommitRecord and ParseAbortRecord
/// functions in PostgreSQL (in src/backend/access/rmgr/xactdesc.c)
pub fn decode(buf: &mut Bytes, mut xid: TransactionId, xl_info: u8) -> XlXactParsedRecord {
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
// The record starts with time of commit/abort
let xact_time = buf.get_u64_le();
let xinfo;
if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
} else {
xinfo = 0;
}
let db_id;
let ts_id;
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
db_id = buf.get_u32_le();
ts_id = buf.get_u32_le();
} else {
db_id = 0;
ts_id = 0;
}
let mut subxacts = Vec::<TransactionId>::new();
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
subxacts.push(subxact);
}
}
let mut xnodes = Vec::<RelFileNode>::new();
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
xnodes.push(RelFileNode {
spcnode,
dbnode,
relnode,
});
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
xid = buf.get_u32_le();
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
}
XlXactParsedRecord {
xid,
info,
xact_time,
xinfo,
db_id,
ts_id,
subxacts,
xnodes,
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct MultiXactMember {
@@ -542,14 +447,14 @@ impl XlMultiXactCreate {
#[repr(C)]
#[derive(Debug)]
pub struct XlMultiXactTruncate {
pub oldest_multi_db: Oid,
oldest_multi_db: Oid,
/* to-be-truncated range of multixact offsets */
pub start_trunc_off: MultiXactId, /* just for completeness' sake */
pub end_trunc_off: MultiXactId,
start_trunc_off: MultiXactId, /* just for completeness' sake */
end_trunc_off: MultiXactId,
/* to-be-truncated range of multixact members */
pub start_trunc_memb: MultiXactOffset,
pub end_trunc_memb: MultiXactOffset,
start_trunc_memb: MultiXactOffset,
end_trunc_memb: MultiXactOffset,
}
impl XlMultiXactTruncate {
@@ -582,10 +487,11 @@ impl XlMultiXactTruncate {
// block data
// ...
// main data
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
let mut rnode_spcnode: u32 = 0;
let mut rnode_dbnode: u32 = 0;
let mut rnode_relnode: u32 = 0;
pub fn decode_wal_record(checkpoint: &mut CheckPoint, record: Bytes) -> DecodedWALRecord {
let mut spcnode: u32 = 0;
let mut dbnode: u32 = 0;
let mut relnode: u32 = 0;
let mut forknum: u8;
let mut got_rnode = false;
let mut buf = record.clone();
@@ -601,6 +507,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
xlogrec.xl_info
);
checkpoint.update_next_xid(xlogrec.xl_xid);
let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD;
if buf.remaining() != remaining {
@@ -659,7 +566,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
max_block_id = block_id;
fork_flags = buf.get_u8();
blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
blk.flags = fork_flags;
blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
@@ -765,9 +672,9 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
}
}
if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
rnode_spcnode = buf.get_u32_le();
rnode_dbnode = buf.get_u32_le();
rnode_relnode = buf.get_u32_le();
spcnode = buf.get_u32_le();
dbnode = buf.get_u32_le();
relnode = buf.get_u32_le();
got_rnode = true;
} else if !got_rnode {
// TODO
@@ -778,18 +685,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
goto err; */
}
blk.rnode_spcnode = rnode_spcnode;
blk.rnode_dbnode = rnode_dbnode;
blk.rnode_relnode = rnode_relnode;
blk.blkno = buf.get_u32_le();
trace!(
"this record affects {}/{}/{} blk {}",
rnode_spcnode,
rnode_dbnode,
rnode_relnode,
blk.blkno
);
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum,
spcnode,
dbnode,
relnode,
},
blknum: buf.get_u32_le(),
});
trace!("this record affects {:?}", blk.tag);
blocks.push(blk);
}
@@ -811,11 +716,226 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
assert_eq!(buf.remaining(), main_data_len as usize);
}
// 5. Handle a few special record types that modify blocks without registering
// them with the standard mechanism.
if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
//5. Handle special CLOG and XACT records
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
let mut blk = DecodedBkpBlock::new();
let blknum = buf.get_i32_le() as u32;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
blk.will_init = true;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
blk.will_drop = true;
checkpoint.oldestXid = buf.get_u32_le();
checkpoint.oldestXidDB = buf.get_u32_le();
trace!(
"RM_CLOG_ID truncate blkno {} oldestXid {} oldestXidDB {}",
blknum,
checkpoint.oldestXid,
checkpoint.oldestXidDB
);
}
trace!("RM_CLOG_ID updates block {}", blknum);
blocks.push(blk);
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
if info == pg_constants::XLOG_XACT_COMMIT {
let mut blk = DecodedBkpBlock::new();
let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
trace!(
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blknum,
main_data_len
);
blocks.push(blk);
}
//parse commit record to extract subtrans entries
// xl_xact_commit starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blknum = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blknum != blknum {
prev_blknum = blknum;
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationMetadata(RelTag {
forknum: pg_constants::MAIN_FORKNUM,
spcnode,
dbnode,
relnode,
});
blk.will_drop = true;
blocks.push(blk);
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let xid = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
//TODO handle this to be able to restore pg_twophase on node start
}
} else if info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
if info == pg_constants::XLOG_XACT_ABORT {
let mut blk = DecodedBkpBlock::new();
let blknum = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
trace!(
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blknum,
main_data_len
);
blocks.push(blk);
}
//parse abort record to extract subtrans entries
// xl_xact_abort starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blknum = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blknum = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blknum != blknum {
prev_blknum = blknum;
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationMetadata(RelTag {
forknum: pg_constants::MAIN_FORKNUM,
spcnode,
dbnode,
relnode,
});
blk.will_drop = true;
blocks.push(blk);
trace!(
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let xid = buf.get_u32_le();
let mut blk = DecodedBkpBlock::new();
let blknum = xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.tag = ObjectTag::Clog(SlruBufferTag { blknum });
blocks.push(blk);
trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
}
} else if info == pg_constants::XLOG_XACT_PREPARE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::TwoPhase(PrepareTag {
xid: xlogrec.xl_xid,
});
blk.will_init = true;
blocks.push(blk);
debug!("Prepare transaction {}", xlogrec.xl_xid);
}
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_DBASE_CREATE {
//buf points to main_data
let db_id = buf.get_u32_le();
let tablespace_id = buf.get_u32_le();
let src_db_id = buf.get_u32_le();
let src_tablespace_id = buf.get_u32_le();
trace!(
"XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
tablespace_id,
db_id,
src_tablespace_id,
src_db_id
);
// in postgres it is implemented as copydir
// we need to copy all pages in page_cache
} else {
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_TBLSPC_CREATE {
//buf points to main_data
let ts_id = buf.get_u32_le();
let ts_path = str::from_utf8(&buf).unwrap();
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
} else {
trace!("XLOG_TBLSPC_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_HEAP_INSERT {
let xlrec = XlHeapInsert::decode(&mut buf);
if (xlrec.flags
@@ -823,52 +943,96 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = XlHeapDelete::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
} else if info == pg_constants::XLOG_HEAP_UPDATE
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
{
let xlrec = XlHeapUpdate::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
&& blocks.len() > 1
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.rnode_spcnode = blocks[1].rnode_spcnode;
blk.rnode_dbnode = blocks[1].rnode_dbnode;
blk.rnode_relnode = blocks[1].rnode_relnode;
blocks.push(blk);
if let ObjectTag::RelationBuffer(tag1) = blocks[1].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag1.rel.spcnode,
dbnode: tag1.rel.dbnode,
relnode: tag1.rel.relnode,
},
blknum: tag1.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 1 is expected to be relation buffer tag but it is {:?}",
blocks[1].tag
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_HEAP_OPMASK;
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
let xlrec = XlHeapMultiInsert::decode(&mut buf);
if (xlrec.flags
@@ -876,20 +1040,164 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
if let ObjectTag::RelationBuffer(tag0) = blocks[0].tag {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::RelationBuffer(BufferTag {
rel: RelTag {
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
spcnode: tag0.rel.spcnode,
dbnode: tag0.rel.dbnode,
relnode: tag0.rel.relnode,
},
blknum: tag0.blknum / pg_constants::HEAPBLOCKS_PER_PAGE as u32,
});
blocks.push(blk);
} else {
panic!(
"Block 0 is expected to be relation buffer tag but it is {:?}",
blocks[0].tag
);
}
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag {
blknum: buf.get_u32_le(),
});
blk.will_init = true;
blocks.push(blk);
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag {
blknum: buf.get_u32_le(),
});
blk.will_init = true;
blocks.push(blk);
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
// Update offset page
let mut blk = DecodedBkpBlock::new();
let blknum = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
blocks.push(blk);
let first_mbr_blkno = xlrec.moff / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno =
(xlrec.moff + xlrec.nmembers - 1) / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
loop {
// Update members page
let mut blk = DecodedBkpBlock::new();
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
blocks.push(blk);
if blknum == last_mbr_blkno {
// last block inclusive
break;
}
// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
if xlrec.mid >= checkpoint.nextMulti {
checkpoint.nextMulti = xlrec.mid + 1;
}
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
}
let max_mbr_xid =
xlrec.members.iter().fold(
0u32,
|acc, mbr| {
if mbr.xid > acc {
mbr.xid
} else {
acc
}
},
);
checkpoint.update_next_xid(max_mbr_xid);
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
checkpoint.oldestMulti = xlrec.end_trunc_off;
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
let first_off_blkno =
xlrec.start_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
let last_off_blkno =
xlrec.end_trunc_off / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
// Delete all the segments but the last one. The last segment can still
// contain, possibly partially, valid data.
for blknum in first_off_blkno..last_off_blkno {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactOffsets(SlruBufferTag { blknum });
blk.will_drop = true;
blocks.push(blk);
}
let first_mbr_blkno =
xlrec.start_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
let last_mbr_blkno =
xlrec.end_trunc_memb / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
// The members SLRU can, in contrast to the offsets one, be filled to almost
// the full range at once. So we need to handle wraparound.
let mut blknum = first_mbr_blkno;
// Delete all the segments but the last one. The last segment can still
// contain, possibly partially, valid data.
while blknum != last_mbr_blkno {
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::MultiXactMembers(SlruBufferTag { blknum });
blk.will_drop = true;
blocks.push(blk);
// handle wraparound
if blknum == MAX_MBR_BLKNO {
blknum = 0;
} else {
blknum += 1;
}
}
} else {
panic!()
}
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
let mut blk = DecodedBkpBlock::new();
blk.tag = ObjectTag::FileNodeMap(DatabaseTag {
spcnode: xlrec.tsid,
dbnode: xlrec.dbid,
});
blk.will_init = true;
blocks.push(blk);
} else if xlogrec.xl_rmid == pg_constants::RM_XLOG_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
if next_oid > checkpoint.nextOid {
checkpoint.nextOid = next_oid;
}
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes).unwrap();
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid, checkpoint.oldestXid
);
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
}
}
}
DecodedWALRecord {
xl_xid: xlogrec.xl_xid,
xl_info: xlogrec.xl_info,
xl_rmid: xlogrec.xl_rmid,
record,

View File

@@ -4,8 +4,8 @@
//!
//! We keep one WAL receiver active per timeline.
use crate::object_key::*;
use crate::page_cache;
use crate::repository::*;
use crate::restore_local_repo;
use crate::waldecoder::*;
use crate::PageServerConf;
@@ -190,26 +190,17 @@ fn walreceiver_main(
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// Save old checkpoint value to compare with it after decoding WAL record
let old_checkpoint_bytes = checkpoint.encode();
let decoded = decode_wal_record(recdata.clone());
restore_local_repo::save_decoded_record(
&mut checkpoint,
&*timeline,
&decoded,
recdata,
lsn,
)?;
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
restore_local_repo::save_decoded_record(&*timeline, &decoded, recdata, lsn)?;
last_rec_lsn = lsn;
let new_checkpoint_bytes = checkpoint.encode();
// Check if checkpoint data was updated by save_decoded_record
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(
ObjectTag::Checkpoint,
lsn,
new_checkpoint_bytes,
false,
)?;
}
}

View File

@@ -18,6 +18,7 @@
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use std::assert;
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
@@ -36,10 +37,7 @@ use tokio::time::timeout;
use zenith_utils::bin_ser::BeSer;
use zenith_utils::lsn::Lsn;
use crate::object_key::*;
use crate::repository::BufferTag;
use crate::repository::WALRecord;
use crate::waldecoder::XlXactParsedRecord;
use crate::repository::{BufferTag, ObjectTag, WALRecord};
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
use crate::PageServerConf;
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
@@ -173,7 +171,6 @@ impl WalRedoManager for PostgresRedoManager {
) -> Result<Bytes, WalRedoError> {
// Create a channel where to receive the response
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
let request = WalRedoRequest {
tag,
lsn,
@@ -181,7 +178,6 @@ impl WalRedoManager for PostgresRedoManager {
records,
response_channel: tx,
};
self.request_tx
.lock()
.unwrap()
@@ -313,57 +309,150 @@ impl PostgresRedoManagerInternal {
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
// Transaction manager stuff
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
let tag_blknum = match tag {
ObjectTag::Clog(slru) => slru.blknum,
ObjectTag::TwoPhase(_) => {
assert!(info == pg_constants::XLOG_XACT_PREPARE);
trace!("Apply prepare {} record", xlogrec.xl_xid);
page.clear();
page.extend_from_slice(&buf[..]);
continue;
0 // not used by XLOG_XACT_PREPARE
}
_ => panic!("Not valid XACT object tag {:?}", tag),
};
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
if info == pg_constants::XLOG_XACT_COMMIT
|| info == pg_constants::XLOG_XACT_COMMIT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_COMMITTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_SUB_COMMITTED,
&mut page,
);
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
if info == pg_constants::XLOG_XACT_COMMIT {
// status of 2PC transaction will be set later
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
let _xact_time = buf.get_i64_le();
// decode xinfo
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
transaction_id_set_status(
parsed_xact.xid,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
for subxact in &parsed_xact.subxacts {
let blkno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
transaction_id_set_status(
*subxact,
pg_constants::TRANSACTION_STATUS_ABORTED,
&mut page,
);
// handle subtrans
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
// Do not need to handle dropped relations here, just need to skip them
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
// Skip invalidations
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
// Set status of 2PC transaction
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_ABORT
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
status = pg_constants::TRANSACTION_STATUS_ABORTED;
if info == pg_constants::XLOG_XACT_ABORT {
// status of 2PC transaction will be set later
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
//handle subtrans
let _xact_time = buf.get_i64_le();
// decode xinfo
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
// handle subtrans
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag_blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
// Do not need to handle dropped relations here, just need to skip them
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
// Skip invalidations
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
// Set status of 2PC transaction
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_PREPARE {
trace!("Apply prepare {} record", xlogrec.xl_xid);
page.clear();
page.extend_from_slice(&buf[..]);
} else {
error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
status,
record.lsn,
record.main_data_offset, record.rec.len());
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
// Multiexact operations
@@ -371,7 +460,7 @@ impl PostgresRedoManagerInternal {
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
{
// Just need to zero page
// Just need to ero page
page.copy_from_slice(&ZERO_PAGE);
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);

View File

@@ -15,6 +15,7 @@ fn main() {
// All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
//
.header("pg_control_ffi.h")
.header("xlog_ffi.h")
//
// Tell cargo to invalidate the built crate whenever any of the
// included header files changed.

View File

@@ -67,7 +67,6 @@ pub const SLRU_SEG_SIZE: usize = BLCKSZ as usize * SLRU_PAGES_PER_SEGMENT as usi
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
pub const XLOG_HEAP_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;

View File

@@ -390,13 +390,6 @@ impl CheckPoint {
}
}
//
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
// We need this segment to start compute node.
// In order to minimize changes in Postgres core, we prefer to
// provide a WAL segment from which it can extract the checkpoint record in the standard way,
// rather than implement some alternative mechanism.
//
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

3
postgres_ffi/xlog_ffi.h Normal file
View File

@@ -0,0 +1,3 @@
#include "c.h"
#include "access/xlog_internal.h"
#include "access/xlogrecord.h"

View File

@@ -1,97 +0,0 @@
from contextlib import closing
import psycopg2.extras
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test Garbage Collection of old page versions.
#
# This test is pretty tightly coupled with the current implementation of page version storage
# and garbage collection in object_repository.rs.
#
def test_gc(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_gc", "empty"])
pg = postgres.create_start('test_gc')
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
with closing(pageserver.connect()) as psconn:
with psconn.cursor(cursor_factory = psycopg2.extras.DictCursor) as pscur:
# Get the timeline ID of our branch. We need it for the 'do_gc' command
cur.execute("SHOW zenith.zenith_timeline")
timeline = cur.fetchone()[0]
# Create a test table
cur.execute("CREATE TABLE foo(x integer)")
# Run GC, to clear out any old page versions left behind in the catalogs by
# the CREATE TABLE command. We want to have a clean slate with no garbage
# before running the actual tests below, otherwise the counts won't match
# what we expect.
print("Running GC before test")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# remember the number of relations
n_relations = row['n_relations']
assert n_relations > 0
# Insert a row. The first insert will also create a metadata entry for the
# relation, with size == 1 block. Hence, bump up the expected relation count.
n_relations += 1;
print("Inserting one row and running GC")
cur.execute("INSERT INTO foo VALUES (1)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 3
# Insert two more rows and run GC.
print("Inserting two more rows and running GC")
cur.execute("INSERT INTO foo VALUES (2)")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 2
# Insert one more row. It creates one more page version, but doesn't affect the
# relation size.
print("Inserting one more row")
cur.execute("INSERT INTO foo VALUES (3)")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 1
# Run GC again, with no changes in the database. Should not remove anything.
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
assert row['n_relations'] == n_relations
assert row['dropped'] == 0
assert row['truncated'] == 30
assert row['deleted'] == 0
#
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
#
cur.execute("DROP TABLE foo")
pscur.execute(f"do_gc {timeline} 0")
row = pscur.fetchone()
print("GC duration {elapsed} ms, relations: {n_relations}, dropped {dropped}, truncated: {truncated}, deleted: {deleted}".format_map(row))
# Each relation fork is counted separately, hence 3.
assert row['dropped'] == 3

View File

@@ -472,11 +472,11 @@ class WalAcceptor:
cmd = [self.wa_binpath]
cmd.extend(["-D", self.data_dir])
cmd.extend(["-l", "localhost:{}".format(self.port)])
cmd.extend(["-l", "127.0.0.1:{}".format(self.port)])
cmd.append("--daemonize")
cmd.append("--no-sync")
# Tell page server it can receive WAL from this WAL safekeeper
cmd.extend(["--pageserver", "localhost:{}".format(DEFAULT_PAGESERVER_PORT)])
cmd.extend(["--pageserver", "127.0.0.1:{}".format(DEFAULT_PAGESERVER_PORT)])
cmd.extend(["--recall", "1 second"])
print('Running command "{}"'.format(' '.join(cmd)))
subprocess.run(cmd, check=True)
@@ -543,7 +543,7 @@ class WalAcceptorFactory:
def get_connstrs(self) -> str:
""" Get list of wal acceptor endpoints suitable for wal_acceptors GUC """
return ','.join(["localhost:{}".format(wa.port) for wa in self.instances])
return ','.join(["127.0.0.1:{}".format(wa.port) for wa in self.instances])
@zenfixture

View File

@@ -8,6 +8,7 @@ use log::*;
use parse_duration::parse;
use slog::Drain;
use std::io;
use std::net::ToSocketAddrs;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
@@ -73,7 +74,7 @@ fn main() -> Result<()> {
daemonize: false,
no_sync: false,
pageserver_addr: None,
listen_addr: "localhost:5454".to_string(),
listen_addr: "127.0.0.1:5454".parse()?,
ttl: None,
recall_period: None,
};
@@ -94,11 +95,17 @@ fn main() -> Result<()> {
}
if let Some(addr) = arg_matches.value_of("listen") {
conf.listen_addr = addr.to_owned();
// TODO: keep addr vector in config and listen them all
// XXX: with our callmemaybe approach we need to set 'advertised address'
// as it is not always possible to listen it. Another reason to ditch callmemaybe.
let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
conf.listen_addr = addrs[0];
}
if let Some(addr) = arg_matches.value_of("pageserver") {
conf.pageserver_addr = Some(addr.to_owned());
// TODO: keep addr vector in config and check them all while connecting
let addrs: Vec<_> = addr.to_socket_addrs().unwrap().collect();
conf.pageserver_addr = Some(addrs[0]);
}
if let Some(ttl) = arg_matches.value_of("ttl") {
@@ -188,19 +195,12 @@ fn init_logging(
if conf.daemonize {
let decorator = slog_term::PlainSyncDecorator::new(log_file);
let drain = slog_term::CompactFormat::new(decorator).build();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
record.level().is_at_least(slog::Level::Info)
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(drain, slog::o!());
Ok(slog_scope::set_global_logger(logger))
} else {
let decorator = slog_term::TermDecorator::new().build();
let drain = slog_term::FullFormat::new(decorator).build().fuse();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
record.level().is_at_least(slog::Level::Info)
})
.fuse();
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
let logger = slog::Logger::root(drain, slog::o!());
Ok(slog_scope::set_global_logger(logger))

View File

@@ -1,4 +1,5 @@
//
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
@@ -14,8 +15,8 @@ pub struct WalAcceptorConf {
pub data_dir: PathBuf,
pub daemonize: bool,
pub no_sync: bool,
pub listen_addr: String,
pub pageserver_addr: Option<String>,
pub listen_addr: SocketAddr,
pub pageserver_addr: Option<SocketAddr>,
pub ttl: Option<Duration>,
pub recall_period: Option<Duration>,
}

View File

@@ -4,9 +4,8 @@
use anyhow::{bail, Result};
use log::*;
use postgres::{Client, Config, NoTls};
use postgres::{Client, NoTls};
use serde::{Deserialize, Serialize};
use zenith_utils::connstring::connection_host_port;
use std::cmp::{max, min};
use std::fs::{self, File, OpenOptions};
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
@@ -150,18 +149,19 @@ pub struct ReceiveWalConn {
/// If pageserver already has replication channel, it will just ignore this request
///
fn request_callback(conf: WalAcceptorConf, timelineid: ZTimelineId) {
let ps_addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!("postgresql://no_user@{}/no_db", ps_addr);
// use Config parsing because SockAddr parsing doesnt allow to use host names instead of ip addresses
let me_connstr = format!("postgresql://no_user@{}/no_db", conf.listen_addr);
let me_conf: Config = me_connstr.parse().unwrap();
let (host, port) = connection_host_port(&me_conf);
let addr = conf.pageserver_addr.unwrap();
let ps_connstr = format!(
"host={} port={} dbname={} user={}",
addr.ip(),
addr.port(),
"no_db",
"no_user",
);
let callme = format!(
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
timelineid,
host,
port,
conf.listen_addr.ip(),
conf.listen_addr.port(),
timelineid
);
loop {
@@ -241,6 +241,9 @@ impl ReceiveWalConn {
my_info.server = server_info.clone();
my_info.server.node_id = node_id;
/* Need to save incomplete my_info in timeline to provide wal_seg_size for find_end_of_wal */
self.timeline.get().set_info(&my_info);
/* Calculate WAL end based on local data */
let (flush_lsn, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, true);
my_info.flush_lsn = flush_lsn;

View File

@@ -63,11 +63,15 @@ impl ReplicationConn {
let feedback = HotStandbyFeedback::des(&m)?;
timeline.add_hs_feedback(feedback)
}
msg => {
None => {
break;
}
Some(msg) => {
info!("unexpected message {:?}", msg);
}
}
}
Err(anyhow!("Connection closed"))
}
/// Helper function that parses a pair of LSNs.

View File

@@ -232,6 +232,7 @@ impl TimelineTools for Option<Arc<Timeline>> {
/// Find last WAL record. If "precise" is false then just locate last partial segment
fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID) {
let seg_size = self.get().get_info().server.wal_seg_size as usize;
assert!(seg_size > 0);
let (lsn, timeline) = find_end_of_wal(data_dir, seg_size, precise);
(Lsn(lsn), timeline)
}

View File

@@ -16,7 +16,7 @@ use zenith_utils::postgres_backend::PostgresBackend;
/// Accept incoming TCP connections and spawn them into a background thread.
pub fn thread_main(conf: WalAcceptorConf) -> Result<()> {
info!("Starting wal acceptor on {}", conf.listen_addr);
let listener = TcpListener::bind(conf.listen_addr.clone()).map_err(|e| {
let listener = TcpListener::bind(conf.listen_addr).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_addr, e);
e
})?;

View File

@@ -12,7 +12,6 @@ log = "0.4.14"
serde = { version = "1.0", features = ["derive"] }
bincode = "1.3"
thiserror = "1.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
workspace_hack = { path = "../workspace_hack" }
[dev-dependencies]

View File

@@ -1,34 +0,0 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(config.get_hosts().len(), 1, "only one pair of host and port is supported in connection string");
assert_eq!(config.get_ports().len(), 1, "only one pair of host and port is supported in connection string");
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db".parse().unwrap();
assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db".parse().unwrap();
assert_eq!(connection_host_port(&config), ("localhost".to_owned(), 64000));
}
}

View File

@@ -13,6 +13,3 @@ pub mod bin_ser;
pub mod postgres_backend;
pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;